From f91cf5979becb9058e1b94f39f78039b80fb9244 Mon Sep 17 00:00:00 2001
From: "cl349@freefall.cl.cam.ac.uk" <cl349@freefall.cl.cam.ac.uk>
Date: Fri, 30 Jul 2004 18:48:57 +0000
Subject: [PATCH] bitkeeper revision 1.1108.44.1
 (410a9819I8nRg4cqB44agt4G5KysGA)

add network backend driver for 2.6
---
 .rootkeys                                     |   12 +-
 linux-2.4.26-xen-sparse/mkbuildtree           |    2 +-
 .../arch/xen/i386/mm/hypervisor.c             |   97 +-
 linux-2.6.7-xen-sparse/drivers/xen/Makefile   |    4 +-
 .../drivers/xen/net/Makefile                  |    2 -
 .../drivers/xen/netback/Makefile              |    2 +
 .../drivers/xen/netback/common.h              |   97 +
 .../drivers/xen/netback/control.c             |   65 +
 .../drivers/xen/netback/interface.c           |  288 +++
 .../drivers/xen/netback/netback.c             |  778 +++++++
 .../drivers/xen/{net => netfront}/Kconfig     |    0
 .../drivers/xen/netfront/Makefile             |    2 +
 .../{net/network.c => netfront/netfront.c}    |    0
 .../include/asm-xen/asm-i386/hypervisor.h     |   15 +-
 linux-2.6.7-xen-sparse/mm/page_alloc.c        | 2017 +++++++++++++++++
 15 files changed, 3369 insertions(+), 12 deletions(-)
 delete mode 100644 linux-2.6.7-xen-sparse/drivers/xen/net/Makefile
 create mode 100644 linux-2.6.7-xen-sparse/drivers/xen/netback/Makefile
 create mode 100644 linux-2.6.7-xen-sparse/drivers/xen/netback/common.h
 create mode 100644 linux-2.6.7-xen-sparse/drivers/xen/netback/control.c
 create mode 100644 linux-2.6.7-xen-sparse/drivers/xen/netback/interface.c
 create mode 100644 linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c
 rename linux-2.6.7-xen-sparse/drivers/xen/{net => netfront}/Kconfig (100%)
 create mode 100644 linux-2.6.7-xen-sparse/drivers/xen/netfront/Makefile
 rename linux-2.6.7-xen-sparse/drivers/xen/{net/network.c => netfront/netfront.c} (100%)
 create mode 100644 linux-2.6.7-xen-sparse/mm/page_alloc.c

diff --git a/.rootkeys b/.rootkeys
index 12babc83d8..b35246e7c7 100644
--- a/.rootkeys
+++ b/.rootkeys
@@ -203,9 +203,14 @@
 3e5a4e651TH-SXHoufurnWjgl5bfOA linux-2.6.7-xen-sparse/drivers/xen/console/console.c
 40f56239KYxO0YabhPzCTeUuln-lnA linux-2.6.7-xen-sparse/drivers/xen/evtchn/Makefile
 40f56239DoibTX6R-ZYd3QTXAB8_TA linux-2.6.7-xen-sparse/drivers/xen/evtchn/evtchn.c
-40f56239lrg_Ob0BJ8WBFS1zeg2CYw linux-2.6.7-xen-sparse/drivers/xen/net/Kconfig
-40f56239Wd4k_ycG_mFsSO1r5xKdtQ linux-2.6.7-xen-sparse/drivers/xen/net/Makefile
-405853f6nbeazrNyEWNHBuoSg2PiPA linux-2.6.7-xen-sparse/drivers/xen/net/network.c
+410a9817HEVJvred5Oy_uKH3HFJC5Q linux-2.6.7-xen-sparse/drivers/xen/netback/Makefile
+4097ba831lpGeLlPg-bfV8XarVVuoQ linux-2.6.7-xen-sparse/drivers/xen/netback/common.h
+4097ba83wvv8yi5P5xugCUBAdb6O-A linux-2.6.7-xen-sparse/drivers/xen/netback/control.c
+4097ba83byY5bTSugJGZ1exTxIcMKw linux-2.6.7-xen-sparse/drivers/xen/netback/interface.c
+4087cf0dGmSbFhFZyIZBJzvqxY-qBw linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c
+40f56239lrg_Ob0BJ8WBFS1zeg2CYw linux-2.6.7-xen-sparse/drivers/xen/netfront/Kconfig
+40f56239Wd4k_ycG_mFsSO1r5xKdtQ linux-2.6.7-xen-sparse/drivers/xen/netfront/Makefile
+405853f6nbeazrNyEWNHBuoSg2PiPA linux-2.6.7-xen-sparse/drivers/xen/netfront/netfront.c
 4108f5c1ppFXVpQzCOAZ6xXYubsjKA linux-2.6.7-xen-sparse/drivers/xen/privcmd/Makefile
 3e5a4e65IUfzzMu2kZFlGEB8-rpTaA linux-2.6.7-xen-sparse/drivers/xen/privcmd/privcmd.c
 40f56239YAjS52QG2FIAQpHDZAdGHg linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/desc.h
@@ -251,6 +256,7 @@
 40f5623cBiQhPHILVLrl3xa6bDBaRg linux-2.6.7-xen-sparse/include/asm-xen/xen.h
 3f689063BoW-HWV3auUJ-OqXfcGArw linux-2.6.7-xen-sparse/include/asm-xen/xen_proc.h
 40f56a0ddHCSs3501MY4hRf22tctOw linux-2.6.7-xen-sparse/mkbuildtree
+410a94a4KT6I6X0LVc7djB39tRDp4g linux-2.6.7-xen-sparse/mm/page_alloc.c
 40e1b09db5mN69Ijj0X_Eol-S7dXiw tools/Make.defs
 3f776bd1Hy9rn69ntXBhPReUFw9IEA tools/Makefile
 401d7e160vaxMBAUSLSicuZ7AQjJ3w tools/examples/Makefile
diff --git a/linux-2.4.26-xen-sparse/mkbuildtree b/linux-2.4.26-xen-sparse/mkbuildtree
index a2f3a2cd80..46b992a988 100755
--- a/linux-2.4.26-xen-sparse/mkbuildtree
+++ b/linux-2.4.26-xen-sparse/mkbuildtree
@@ -238,4 +238,4 @@ cd ${AD}/arch/xen/drivers/dom0
 ln -sf ../../../../${LINUX_26}/drivers/xen/privcmd/privcmd.c core.c
 
 cd ${AD}/arch/xen/drivers/netif/frontend
-ln -sf ../../../../../${LINUX_26}/drivers/xen/net/network.c main.c
+ln -sf ../../../../../${LINUX_26}/drivers/xen/netfront/netfront.c main.c
diff --git a/linux-2.6.7-xen-sparse/arch/xen/i386/mm/hypervisor.c b/linux-2.6.7-xen-sparse/arch/xen/i386/mm/hypervisor.c
index 35e8c3e312..173a6a0891 100644
--- a/linux-2.6.7-xen-sparse/arch/xen/i386/mm/hypervisor.c
+++ b/linux-2.6.7-xen-sparse/arch/xen/i386/mm/hypervisor.c
@@ -8,6 +8,8 @@
 
 #include <linux/config.h>
 #include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
 #include <asm/hypervisor.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -126,8 +128,11 @@ static inline void __flush_page_update_queue(void)
 #endif
     idx = 0;
     wmb(); /* Make sure index is cleared first to avoid double updates. */
-    if (unlikely(HYPERVISOR_mmu_update(update_queue, _idx, NULL) < 0))
-	panic("Failed to execute MMU updates");
+    if ( unlikely(HYPERVISOR_mmu_update(update_queue, _idx, NULL) < 0) )
+    {
+        printk(KERN_ALERT "Failed to execute MMU updates.\n");
+        BUG();
+    }
 }
 
 void _flush_page_update_queue(void)
@@ -262,3 +267,91 @@ void queue_machphys_update(unsigned long mfn, unsigned long pfn)
     increment_index();
     spin_unlock_irqrestore(&update_lock, flags);
 }
+
+#ifdef CONFIG_XEN_PHYSDEV_ACCESS
+
+unsigned long allocate_empty_lowmem_region(unsigned long pages)
+{
+    pgd_t         *pgd; 
+    pmd_t         *pmd;
+    pte_t         *pte;
+    unsigned long *pfn_array;
+    unsigned long  vstart;
+    unsigned long  i;
+    int            ret;
+    unsigned int   order = get_order(pages*PAGE_SIZE);
+
+    vstart = __get_free_pages(GFP_KERNEL, order);
+    if ( vstart == 0 )
+        return 0UL;
+
+    pfn_array = vmalloc((1<<order) * sizeof(*pfn_array));
+    if ( pfn_array == NULL )
+        BUG();
+
+    for ( i = 0; i < (1<<order); i++ )
+    {
+        pgd = pgd_offset_k(   (vstart + (i*PAGE_SIZE)));
+        pmd = pmd_offset(pgd, (vstart + (i*PAGE_SIZE)));
+        pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE))); 
+        pfn_array[i] = pte->pte_low >> PAGE_SHIFT;
+        queue_l1_entry_update(pte, 0);
+        phys_to_machine_mapping[__pa(vstart)>>PAGE_SHIFT] = 0xdeadbeef;
+    }
+
+    flush_page_update_queue();
+
+    ret = HYPERVISOR_dom_mem_op(MEMOP_decrease_reservation, 
+                                pfn_array, 1<<order);
+    if ( unlikely(ret != (1<<order)) )
+    {
+        printk(KERN_WARNING "Unable to reduce memory reservation (%d)\n", ret);
+        BUG();
+    }
+
+    vfree(pfn_array);
+
+    return vstart;
+}
+
+void deallocate_lowmem_region(unsigned long vstart, unsigned long pages)
+{
+    pgd_t         *pgd; 
+    pmd_t         *pmd;
+    pte_t         *pte;
+    unsigned long *pfn_array;
+    unsigned long  i;
+    int            ret;
+    unsigned int   order = get_order(pages*PAGE_SIZE);
+
+    pfn_array = vmalloc((1<<order) * sizeof(*pfn_array));
+    if ( pfn_array == NULL )
+        BUG();
+
+    ret = HYPERVISOR_dom_mem_op(MEMOP_increase_reservation,
+                                pfn_array, 1<<order);
+    if ( unlikely(ret != (1<<order)) )
+    {
+        printk(KERN_WARNING "Unable to increase memory reservation (%d)\n",
+               ret);
+        BUG();
+    }
+
+    for ( i = 0; i < (1<<order); i++ )
+    {
+        pgd = pgd_offset_k(   (vstart + (i*PAGE_SIZE)));
+        pmd = pmd_offset(pgd, (vstart + (i*PAGE_SIZE)));
+        pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE)));
+        queue_l1_entry_update(pte, (pfn_array[i]<<PAGE_SHIFT)|__PAGE_KERNEL);
+        queue_machphys_update(pfn_array[i], __pa(vstart)>>PAGE_SHIFT);
+        phys_to_machine_mapping[__pa(vstart)>>PAGE_SHIFT] = pfn_array[i];
+    }
+
+    flush_page_update_queue();
+
+    vfree(pfn_array);
+
+    free_pages(vstart, order);
+}
+
+#endif /* CONFIG_XEN_PHYSDEV_ACCESS */
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/Makefile b/linux-2.6.7-xen-sparse/drivers/xen/Makefile
index 2b3219fd4e..ae06ee79a7 100644
--- a/linux-2.6.7-xen-sparse/drivers/xen/Makefile
+++ b/linux-2.6.7-xen-sparse/drivers/xen/Makefile
@@ -3,5 +3,7 @@
 obj-y	+= block/
 obj-y	+= console/
 obj-y	+= evtchn/
-obj-y	+= net/
+obj-y	+= netfront/
 obj-y	+= privcmd/
+
+obj-$(CONFIG_XEN_PHYSDEV_ACCESS)	+= netback/
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/net/Makefile b/linux-2.6.7-xen-sparse/drivers/xen/net/Makefile
deleted file mode 100644
index 694dd834f0..0000000000
--- a/linux-2.6.7-xen-sparse/drivers/xen/net/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-
-obj-y	:= network.o
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/netback/Makefile b/linux-2.6.7-xen-sparse/drivers/xen/netback/Makefile
new file mode 100644
index 0000000000..3279442145
--- /dev/null
+++ b/linux-2.6.7-xen-sparse/drivers/xen/netback/Makefile
@@ -0,0 +1,2 @@
+
+obj-y	:= netback.o control.o interface.o
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/netback/common.h b/linux-2.6.7-xen-sparse/drivers/xen/netback/common.h
new file mode 100644
index 0000000000..b977dfff39
--- /dev/null
+++ b/linux-2.6.7-xen-sparse/drivers/xen/netback/common.h
@@ -0,0 +1,97 @@
+/******************************************************************************
+ * arch/xen/drivers/netif/backend/common.h
+ */
+
+#ifndef __NETIF__BACKEND__COMMON_H__
+#define __NETIF__BACKEND__COMMON_H__
+
+#include <linux/config.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <asm-xen/ctrl_if.h>
+#include <asm/io.h>
+#include <asm/pgalloc.h>
+#include <asm-xen/netif.h>
+
+#if 0
+#define ASSERT(_p) \
+    if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \
+    __LINE__, __FILE__); *(int*)0=0; }
+#define DPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \
+                           __FILE__ , __LINE__ , ## _a )
+#else
+#define ASSERT(_p) ((void)0)
+#define DPRINTK(_f, _a...) ((void)0)
+#endif
+
+typedef struct netif_st {
+    /* Unique identifier for this interface. */
+    domid_t          domid;
+    unsigned int     handle;
+
+    /* Physical parameters of the comms window. */
+    unsigned long    tx_shmem_frame;
+    unsigned long    rx_shmem_frame;
+    unsigned int     evtchn;
+    int              irq;
+
+    /* The shared rings and indexes. */
+    netif_tx_interface_t *tx;
+    netif_rx_interface_t *rx;
+
+    /* Private indexes into shared ring. */
+    NETIF_RING_IDX rx_req_cons;
+    NETIF_RING_IDX rx_resp_prod; /* private version of shared variable */
+    NETIF_RING_IDX tx_req_cons;
+    NETIF_RING_IDX tx_resp_prod; /* private version of shared variable */
+
+    /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
+    unsigned long   credit_bytes;
+    unsigned long   credit_usec;
+    unsigned long   remaining_credit;
+    struct timer_list credit_timeout;
+
+    /* Miscellaneous private stuff. */
+    enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
+    /*
+     * DISCONNECT response is deferred until pending requests are ack'ed.
+     * We therefore need to store the id from the original request.
+     */
+    u8               disconnect_rspid;
+    struct netif_st *hash_next;
+    struct list_head list;  /* scheduling list */
+    atomic_t         refcnt;
+    spinlock_t       rx_lock, tx_lock;
+    struct net_device *dev;
+    struct net_device_stats stats;
+} netif_t;
+
+void netif_create(netif_be_create_t *create);
+void netif_destroy(netif_be_destroy_t *destroy);
+void netif_connect(netif_be_connect_t *connect);
+int  netif_disconnect(netif_be_disconnect_t *disconnect, u8 rsp_id);
+void __netif_disconnect_complete(netif_t *netif);
+netif_t *netif_find_by_handle(domid_t domid, unsigned int handle);
+#define netif_get(_b) (atomic_inc(&(_b)->refcnt))
+#define netif_put(_b)                             \
+    do {                                          \
+        if ( atomic_dec_and_test(&(_b)->refcnt) ) \
+            __netif_disconnect_complete(_b);      \
+    } while (0)
+
+void netif_interface_init(void);
+void netif_ctrlif_init(void);
+
+void netif_deschedule(netif_t *netif);
+
+int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev);
+struct net_device_stats *netif_be_get_stats(struct net_device *dev);
+irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs);
+
+#endif /* __NETIF__BACKEND__COMMON_H__ */
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/netback/control.c b/linux-2.6.7-xen-sparse/drivers/xen/netback/control.c
new file mode 100644
index 0000000000..cf1b075031
--- /dev/null
+++ b/linux-2.6.7-xen-sparse/drivers/xen/netback/control.c
@@ -0,0 +1,65 @@
+/******************************************************************************
+ * arch/xen/drivers/netif/backend/control.c
+ * 
+ * Routines for interfacing with the control plane.
+ * 
+ * Copyright (c) 2004, Keir Fraser
+ */
+
+#include "common.h"
+
+static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
+{
+    switch ( msg->subtype )
+    {
+    case CMSG_NETIF_BE_CREATE:
+        if ( msg->length != sizeof(netif_be_create_t) )
+            goto parse_error;
+        netif_create((netif_be_create_t *)&msg->msg[0]);
+        break;        
+    case CMSG_NETIF_BE_DESTROY:
+        if ( msg->length != sizeof(netif_be_destroy_t) )
+            goto parse_error;
+        netif_destroy((netif_be_destroy_t *)&msg->msg[0]);
+        break;        
+    case CMSG_NETIF_BE_CONNECT:
+        if ( msg->length != sizeof(netif_be_connect_t) )
+            goto parse_error;
+        netif_connect((netif_be_connect_t *)&msg->msg[0]);
+        break;        
+    case CMSG_NETIF_BE_DISCONNECT:
+        if ( msg->length != sizeof(netif_be_disconnect_t) )
+            goto parse_error;
+        if ( !netif_disconnect((netif_be_disconnect_t *)&msg->msg[0],msg->id) )
+            return; /* Sending the response is deferred until later. */
+        break;        
+    default:
+        goto parse_error;
+    }
+
+    ctrl_if_send_response(msg);
+    return;
+
+ parse_error:
+    DPRINTK("Parse error while reading message subtype %d, len %d\n",
+            msg->subtype, msg->length);
+    msg->length = 0;
+    ctrl_if_send_response(msg);
+}
+
+void netif_ctrlif_init(void)
+{
+    ctrl_msg_t                       cmsg;
+    netif_be_driver_status_changed_t st;
+
+    (void)ctrl_if_register_receiver(CMSG_NETIF_BE, netif_ctrlif_rx,
+                                    CALLBACK_IN_BLOCKING_CONTEXT);
+
+    /* Send a driver-UP notification to the domain controller. */
+    cmsg.type      = CMSG_NETIF_BE;
+    cmsg.subtype   = CMSG_NETIF_BE_DRIVER_STATUS_CHANGED;
+    cmsg.length    = sizeof(netif_be_driver_status_changed_t);
+    st.status      = NETIF_DRIVER_STATUS_UP;
+    memcpy(cmsg.msg, &st, sizeof(st));
+    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+}
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/netback/interface.c b/linux-2.6.7-xen-sparse/drivers/xen/netback/interface.c
new file mode 100644
index 0000000000..01447b1de8
--- /dev/null
+++ b/linux-2.6.7-xen-sparse/drivers/xen/netback/interface.c
@@ -0,0 +1,288 @@
+/******************************************************************************
+ * arch/xen/drivers/netif/backend/interface.c
+ * 
+ * Network-device interface management.
+ * 
+ * Copyright (c) 2004, Keir Fraser
+ */
+
+#include "common.h"
+#include <linux/rtnetlink.h>
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#define VMALLOC_VMADDR(x) ((unsigned long)(x))
+#endif
+
+#define NETIF_HASHSZ 1024
+#define NETIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(NETIF_HASHSZ-1))
+
+static netif_t *netif_hash[NETIF_HASHSZ];
+
+netif_t *netif_find_by_handle(domid_t domid, unsigned int handle)
+{
+    netif_t *netif = netif_hash[NETIF_HASH(domid, handle)];
+    while ( (netif != NULL) && 
+            ((netif->domid != domid) || (netif->handle != handle)) )
+        netif = netif->hash_next;
+    return netif;
+}
+
+void __netif_disconnect_complete(netif_t *netif)
+{
+    ctrl_msg_t            cmsg;
+    netif_be_disconnect_t disc;
+
+    /*
+     * These can't be done in __netif_disconnect() because at that point there
+     * may be outstanding requests at the disc whose asynchronous responses
+     * must still be notified to the remote driver.
+     */
+    unbind_evtchn_from_irq(netif->evtchn);
+    vfree(netif->tx); /* Frees netif->rx as well. */
+    rtnl_lock();
+    (void)dev_close(netif->dev);
+    rtnl_unlock();
+
+    /* Construct the deferred response message. */
+    cmsg.type         = CMSG_NETIF_BE;
+    cmsg.subtype      = CMSG_NETIF_BE_DISCONNECT;
+    cmsg.id           = netif->disconnect_rspid;
+    cmsg.length       = sizeof(netif_be_disconnect_t);
+    disc.domid        = netif->domid;
+    disc.netif_handle = netif->handle;
+    disc.status       = NETIF_BE_STATUS_OKAY;
+    memcpy(cmsg.msg, &disc, sizeof(disc));
+
+    /*
+     * Make sure message is constructed /before/ status change, because
+     * after the status change the 'netif' structure could be deallocated at
+     * any time. Also make sure we send the response /after/ status change,
+     * as otherwise a subsequent CONNECT request could spuriously fail if
+     * another CPU doesn't see the status change yet.
+     */
+    mb();
+    if ( netif->status != DISCONNECTING )
+        BUG();
+    netif->status = DISCONNECTED;
+    mb();
+
+    /* Send the successful response. */
+    ctrl_if_send_response(&cmsg);
+}
+
+void netif_create(netif_be_create_t *create)
+{
+    int                err = 0;
+    domid_t            domid  = create->domid;
+    unsigned int       handle = create->netif_handle;
+    struct net_device *dev;
+    netif_t          **pnetif, *netif;
+    char               name[IFNAMSIZ] = {};
+
+    snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle);
+    dev = alloc_netdev(sizeof(netif_t), name, ether_setup);
+    if ( dev == NULL )
+    {
+        DPRINTK("Could not create netif: out of memory\n");
+        create->status = NETIF_BE_STATUS_OUT_OF_MEMORY;
+        return;
+    }
+
+    netif = dev->priv;
+    memset(netif, 0, sizeof(*netif));
+    netif->domid  = domid;
+    netif->handle = handle;
+    netif->status = DISCONNECTED;
+    spin_lock_init(&netif->rx_lock);
+    spin_lock_init(&netif->tx_lock);
+    atomic_set(&netif->refcnt, 0);
+    netif->dev = dev;
+
+    netif->credit_bytes = netif->remaining_credit = ~0UL;
+    netif->credit_usec  = 0UL;
+    /*init_ac_timer(&new_vif->credit_timeout);*/
+
+    pnetif = &netif_hash[NETIF_HASH(domid, handle)];
+    while ( *pnetif != NULL )
+    {
+        if ( ((*pnetif)->domid == domid) && ((*pnetif)->handle == handle) )
+        {
+            DPRINTK("Could not create netif: already exists\n");
+            create->status = NETIF_BE_STATUS_INTERFACE_EXISTS;
+            kfree(dev);
+            return;
+        }
+        pnetif = &(*pnetif)->hash_next;
+    }
+
+    dev->hard_start_xmit = netif_be_start_xmit;
+    dev->get_stats       = netif_be_get_stats;
+    memcpy(dev->dev_addr, create->mac, ETH_ALEN);
+
+    /* Disable queuing. */
+    dev->tx_queue_len = 0;
+
+    /* Force a different MAC from remote end. */
+    dev->dev_addr[2] ^= 1;
+
+    if ( (err = register_netdev(dev)) != 0 )
+    {
+        DPRINTK("Could not register new net device %s: err=%d\n",
+                dev->name, err);
+        create->status = NETIF_BE_STATUS_OUT_OF_MEMORY;
+        kfree(dev);
+        return;
+    }
+
+    netif->hash_next = *pnetif;
+    *pnetif = netif;
+
+    DPRINTK("Successfully created netif\n");
+    create->status = NETIF_BE_STATUS_OKAY;
+}
+
+void netif_destroy(netif_be_destroy_t *destroy)
+{
+    domid_t       domid  = destroy->domid;
+    unsigned int  handle = destroy->netif_handle;
+    netif_t     **pnetif, *netif;
+
+    pnetif = &netif_hash[NETIF_HASH(domid, handle)];
+    while ( (netif = *pnetif) != NULL )
+    {
+        if ( (netif->domid == domid) && (netif->handle == handle) )
+        {
+            if ( netif->status != DISCONNECTED )
+                goto still_connected;
+            goto destroy;
+        }
+        pnetif = &netif->hash_next;
+    }
+
+    destroy->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND;
+    return;
+
+ still_connected:
+    destroy->status = NETIF_BE_STATUS_INTERFACE_CONNECTED;
+    return;
+
+ destroy:
+    *pnetif = netif->hash_next;
+    unregister_netdev(netif->dev);
+    kfree(netif->dev);
+    destroy->status = NETIF_BE_STATUS_OKAY;
+}
+
+void netif_connect(netif_be_connect_t *connect)
+{
+    domid_t       domid  = connect->domid;
+    unsigned int  handle = connect->netif_handle;
+    unsigned int  evtchn = connect->evtchn;
+    unsigned long tx_shmem_frame = connect->tx_shmem_frame;
+    unsigned long rx_shmem_frame = connect->rx_shmem_frame;
+    struct vm_struct *vma;
+    pgprot_t      prot;
+    int           error;
+    netif_t      *netif;
+#if 0
+    struct net_device *eth0_dev;
+#endif
+
+    netif = netif_find_by_handle(domid, handle);
+    if ( unlikely(netif == NULL) )
+    {
+        DPRINTK("netif_connect attempted for non-existent netif (%u,%u)\n", 
+                connect->domid, connect->netif_handle); 
+        connect->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return;
+    }
+
+    if ( netif->status != DISCONNECTED )
+    {
+        connect->status = NETIF_BE_STATUS_INTERFACE_CONNECTED;
+        return;
+    }
+
+    if ( (vma = get_vm_area(2*PAGE_SIZE, VM_IOREMAP)) == NULL )
+    {
+        connect->status = NETIF_BE_STATUS_OUT_OF_MEMORY;
+        return;
+    }
+
+    prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED);
+    error  = direct_remap_area_pages(&init_mm, 
+                                     VMALLOC_VMADDR(vma->addr),
+                                     tx_shmem_frame<<PAGE_SHIFT, PAGE_SIZE,
+                                     prot, domid);
+    error |= direct_remap_area_pages(&init_mm, 
+                                     VMALLOC_VMADDR(vma->addr) + PAGE_SIZE,
+                                     rx_shmem_frame<<PAGE_SHIFT, PAGE_SIZE,
+                                     prot, domid);
+    if ( error != 0 )
+    {
+        if ( error == -ENOMEM )
+            connect->status = NETIF_BE_STATUS_OUT_OF_MEMORY;
+        else if ( error == -EFAULT )
+            connect->status = NETIF_BE_STATUS_MAPPING_ERROR;
+        else
+            connect->status = NETIF_BE_STATUS_ERROR;
+        vfree(vma->addr);
+        return;
+    }
+
+    netif->evtchn         = evtchn;
+    netif->irq            = bind_evtchn_to_irq(evtchn);
+    netif->tx_shmem_frame = tx_shmem_frame;
+    netif->rx_shmem_frame = rx_shmem_frame;
+    netif->tx             = 
+        (netif_tx_interface_t *)vma->addr;
+    netif->rx             = 
+        (netif_rx_interface_t *)((char *)vma->addr + PAGE_SIZE);
+    netif->status         = CONNECTED;
+    netif_get(netif);
+
+    rtnl_lock();
+    (void)dev_open(netif->dev);
+    rtnl_unlock();
+
+    (void)request_irq(netif->irq, netif_be_int, 0, netif->dev->name, netif);
+    netif_start_queue(netif->dev);
+
+    connect->status = NETIF_BE_STATUS_OKAY;
+}
+
+int netif_disconnect(netif_be_disconnect_t *disconnect, u8 rsp_id)
+{
+    domid_t       domid  = disconnect->domid;
+    unsigned int  handle = disconnect->netif_handle;
+    netif_t      *netif;
+
+    netif = netif_find_by_handle(domid, handle);
+    if ( unlikely(netif == NULL) )
+    {
+        DPRINTK("netif_disconnect attempted for non-existent netif"
+                " (%u,%u)\n", disconnect->domid, disconnect->netif_handle); 
+        disconnect->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return 1; /* Caller will send response error message. */
+    }
+
+    if ( netif->status == CONNECTED )
+    {
+        netif->status = DISCONNECTING;
+        netif->disconnect_rspid = rsp_id;
+        wmb(); /* Let other CPUs see the status change. */
+        netif_stop_queue(netif->dev);
+        free_irq(netif->irq, netif);
+        netif_deschedule(netif);
+        netif_put(netif);
+        return 0; /* Caller should not send response message. */
+    }
+
+    disconnect->status = NETIF_BE_STATUS_OKAY;
+    return 1;
+}
+
+void netif_interface_init(void)
+{
+    memset(netif_hash, 0, sizeof(netif_hash));
+}
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c b/linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c
new file mode 100644
index 0000000000..c0381048d8
--- /dev/null
+++ b/linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c
@@ -0,0 +1,778 @@
+/******************************************************************************
+ * arch/xen/drivers/netif/backend/main.c
+ * 
+ * Back-end of the driver for virtual block devices. This portion of the
+ * driver exports a 'unified' block-device interface that can be accessed
+ * by any operating system that implements a compatible front end. A 
+ * reference front-end implementation can be found in:
+ *  arch/xen/drivers/netif/frontend
+ * 
+ * Copyright (c) 2002-2004, K A Fraser
+ */
+
+#include "common.h"
+
+static void netif_page_release(struct page *page);
+static void make_tx_response(netif_t *netif, 
+                             u16      id,
+                             s8       st);
+static int  make_rx_response(netif_t *netif, 
+                             u16      id, 
+                             s8       st,
+                             memory_t addr,
+                             u16      size);
+
+static void net_tx_action(unsigned long unused);
+static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
+
+static void net_rx_action(unsigned long unused);
+static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0);
+
+static struct sk_buff_head rx_queue;
+static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE*2];
+static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE*3];
+static unsigned char rx_notify[NR_EVENT_CHANNELS];
+
+/* Don't currently gate addition of an interface to the tx scheduling list. */
+#define tx_work_exists(_if) (1)
+
+#define MAX_PENDING_REQS 256
+static unsigned long mmap_vstart;
+#define MMAP_VADDR(_req) (mmap_vstart + ((_req) * PAGE_SIZE))
+
+#define PKT_PROT_LEN (ETH_HLEN + 20)
+
+static struct {
+    netif_tx_request_t req;
+    netif_t *netif;
+} pending_tx_info[MAX_PENDING_REQS];
+static u16 pending_ring[MAX_PENDING_REQS];
+typedef unsigned int PEND_RING_IDX;
+#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
+static PEND_RING_IDX pending_prod, pending_cons;
+#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
+
+/* Freed TX SKBs get batched on this ring before return to pending_ring. */
+static u16 dealloc_ring[MAX_PENDING_REQS];
+static spinlock_t dealloc_lock = SPIN_LOCK_UNLOCKED;
+static PEND_RING_IDX dealloc_prod, dealloc_cons;
+
+static struct sk_buff_head tx_queue;
+static multicall_entry_t tx_mcl[MAX_PENDING_REQS];
+
+static struct list_head net_schedule_list;
+static spinlock_t net_schedule_list_lock;
+
+#define MAX_MFN_ALLOC 64
+static unsigned long mfn_list[MAX_MFN_ALLOC];
+static unsigned int alloc_index = 0;
+static spinlock_t mfn_lock = SPIN_LOCK_UNLOCKED;
+
+static void __refresh_mfn_list(void)
+{
+    int ret = HYPERVISOR_dom_mem_op(MEMOP_increase_reservation,
+                                    mfn_list, MAX_MFN_ALLOC);
+    if ( unlikely(ret != MAX_MFN_ALLOC) )
+        BUG();
+    alloc_index = MAX_MFN_ALLOC;
+}
+
+static unsigned long get_new_mfn(void)
+{
+    unsigned long mfn, flags;
+    spin_lock_irqsave(&mfn_lock, flags);
+    if ( alloc_index == 0 )
+        __refresh_mfn_list();
+    mfn = mfn_list[--alloc_index];
+    spin_unlock_irqrestore(&mfn_lock, flags);
+    return mfn;
+}
+
+static void dealloc_mfn(unsigned long mfn)
+{
+    unsigned long flags;
+    spin_lock_irqsave(&mfn_lock, flags);
+    if ( alloc_index != MAX_MFN_ALLOC )
+        mfn_list[alloc_index++] = mfn;
+    else if ( HYPERVISOR_dom_mem_op(MEMOP_decrease_reservation, &mfn, 1) != 1 )
+        BUG();
+    spin_unlock_irqrestore(&mfn_lock, flags);
+}
+
+static inline void maybe_schedule_tx_action(void)
+{
+    smp_mb();
+    if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
+         !list_empty(&net_schedule_list) )
+        tasklet_schedule(&net_tx_tasklet);
+}
+
+/*
+ * This is the primary RECEIVE function for a network interface.
+ * Note that, from the p.o.v. of /this/ OS it looks like a transmit.
+ */
+int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+    netif_t *netif = (netif_t *)dev->priv;
+
+    ASSERT(skb->dev == dev);
+
+    /* Drop the packet if the target domain has no receive buffers. */
+    if ( (netif->rx_req_cons == netif->rx->req_prod) ||
+         ((netif->rx_req_cons-netif->rx_resp_prod) == NETIF_RX_RING_SIZE) )
+        goto drop;
+
+    /*
+     * We do not copy the packet unless:
+     *  1. The data is shared; or
+     *  2. It spans a page boundary; or
+     *  3. We cannot be sure the whole data page is allocated.
+     * The copying method is taken from skb_copy().
+     * NB. We also couldn't cope with fragmented packets, but we won't get
+     *     any because we not advertise the NETIF_F_SG feature.
+     */
+    if ( skb_shared(skb) || skb_cloned(skb) || 
+         (((unsigned long)skb->end ^ (unsigned long)skb->head) & PAGE_MASK) ||
+         ((skb->end - skb->head) < (PAGE_SIZE/2)) )
+    {
+        struct sk_buff *nskb = alloc_skb(PAGE_SIZE-1024, GFP_ATOMIC);
+        int hlen = skb->data - skb->head;
+        if ( unlikely(nskb == NULL) )
+            goto drop;
+        skb_reserve(nskb, hlen);
+        __skb_put(nskb, skb->len);
+        (void)skb_copy_bits(skb, -hlen, nskb->head, hlen + skb->len);
+        nskb->dev = skb->dev;
+        dev_kfree_skb(skb);
+        skb = nskb;
+    }
+
+    netif->rx_req_cons++;
+
+    skb_queue_tail(&rx_queue, skb);
+    tasklet_schedule(&net_rx_tasklet);
+
+    return 0;
+
+ drop:
+    netif->stats.rx_dropped++;
+    dev_kfree_skb(skb);
+    return 0;
+}
+
+#if 0
+static void xen_network_done_notify(void)
+{
+    static struct net_device *eth0_dev = NULL;
+    if ( unlikely(eth0_dev == NULL) )
+        eth0_dev = __dev_get_by_name("eth0");
+    netif_rx_schedule(eth0_dev);
+}
+/* 
+ * Add following to poll() function in NAPI driver (Tigon3 is example):
+ *  if ( xen_network_done() )
+ *      tg3_enable_ints(tp); 
+ */
+int xen_network_done(void)
+{
+    return skb_queue_empty(&rx_queue);
+}
+#endif
+
+static void net_rx_action(unsigned long unused)
+{
+    netif_t *netif;
+    s8 status;
+    u16 size, id, evtchn;
+    mmu_update_t *mmu;
+    multicall_entry_t *mcl;
+    unsigned long vdata, mdata, new_mfn;
+    struct sk_buff_head rxq;
+    struct sk_buff *skb;
+    u16 notify_list[NETIF_RX_RING_SIZE];
+    int notify_nr = 0;
+
+    skb_queue_head_init(&rxq);
+
+    mcl = rx_mcl;
+    mmu = rx_mmu;
+    while ( (skb = skb_dequeue(&rx_queue)) != NULL )
+    {
+        netif   = (netif_t *)skb->dev->priv;
+        vdata   = (unsigned long)skb->data;
+        mdata   = virt_to_machine(vdata);
+        new_mfn = get_new_mfn();
+        
+        mmu[0].ptr  = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
+        mmu[0].val  = __pa(vdata) >> PAGE_SHIFT;        
+        mmu[1].val  = (unsigned long)(netif->domid<<16) & ~0xFFFFUL;
+        mmu[1].ptr  = (unsigned long)(netif->domid<< 0) & ~0xFFFFUL;
+        mmu[1].ptr |= MMU_EXTENDED_COMMAND;
+        mmu[1].val |= MMUEXT_SET_SUBJECTDOM;
+        mmu[2].ptr  = (mdata & PAGE_MASK) | MMU_EXTENDED_COMMAND;
+        mmu[2].val  = MMUEXT_REASSIGN_PAGE;
+
+        mcl[0].op = __HYPERVISOR_update_va_mapping;
+        mcl[0].args[0] = vdata >> PAGE_SHIFT;
+        mcl[0].args[1] = (new_mfn << PAGE_SHIFT) | __PAGE_KERNEL;
+        mcl[0].args[2] = 0;
+        mcl[1].op = __HYPERVISOR_mmu_update;
+        mcl[1].args[0] = (unsigned long)mmu;
+        mcl[1].args[1] = 3;
+        mcl[1].args[2] = 0;
+
+        mcl += 2;
+        mmu += 3;
+
+        __skb_queue_tail(&rxq, skb);
+
+        /* Filled the batch queue? */
+        if ( (mcl - rx_mcl) == ARRAY_SIZE(rx_mcl) )
+            break;
+    }
+
+    if ( mcl == rx_mcl )
+        return;
+
+    mcl[-2].args[2] = UVMF_FLUSH_TLB;
+    if ( unlikely(HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl) != 0) )
+        BUG();
+
+    mcl = rx_mcl;
+    mmu = rx_mmu;
+    while ( (skb = __skb_dequeue(&rxq)) != NULL )
+    {
+        netif   = (netif_t *)skb->dev->priv;
+        size    = skb->tail - skb->data;
+
+        /* Rederive the machine addresses. */
+        new_mfn = mcl[0].args[1] >> PAGE_SHIFT;
+        mdata   = ((mmu[2].ptr & PAGE_MASK) |
+                   ((unsigned long)skb->data & ~PAGE_MASK));
+        
+        phys_to_machine_mapping[__pa(skb->data) >> PAGE_SHIFT] = new_mfn;
+        
+        atomic_set(&(skb_shinfo(skb)->dataref), 1);
+        skb_shinfo(skb)->nr_frags = 0;
+        skb_shinfo(skb)->frag_list = NULL;
+
+        netif->stats.rx_bytes += size;
+        netif->stats.rx_packets++;
+
+        /* The update_va_mapping() must not fail. */
+        if ( unlikely(mcl[0].args[5] != 0) )
+            BUG();
+
+        /* Check the reassignment error code. */
+        status = NETIF_RSP_OKAY;
+        if ( unlikely(mcl[1].args[5] != 0) )
+        {
+            DPRINTK("Failed MMU update transferring to DOM%u\n", netif->domid);
+            dealloc_mfn(mdata >> PAGE_SHIFT);
+            status = NETIF_RSP_ERROR;
+        }
+
+        evtchn = netif->evtchn;
+        id = netif->rx->ring[MASK_NETIF_RX_IDX(netif->rx_resp_prod)].req.id;
+        if ( make_rx_response(netif, id, status, mdata, size) &&
+             (rx_notify[evtchn] == 0) )
+        {
+            rx_notify[evtchn] = 1;
+            notify_list[notify_nr++] = evtchn;
+        }
+
+        dev_kfree_skb(skb);
+
+        mcl += 2;
+        mmu += 3;
+    }
+
+    while ( notify_nr != 0 )
+    {
+        evtchn = notify_list[--notify_nr];
+        rx_notify[evtchn] = 0;
+        notify_via_evtchn(evtchn);
+    }
+
+    /* More work to do? */
+    if ( !skb_queue_empty(&rx_queue) )
+        tasklet_schedule(&net_rx_tasklet);
+#if 0
+    else
+        xen_network_done_notify();
+#endif
+}
+
+struct net_device_stats *netif_be_get_stats(struct net_device *dev)
+{
+    netif_t *netif = dev->priv;
+    return &netif->stats;
+}
+
+static int __on_net_schedule_list(netif_t *netif)
+{
+    return netif->list.next != NULL;
+}
+
+static void remove_from_net_schedule_list(netif_t *netif)
+{
+    spin_lock_irq(&net_schedule_list_lock);
+    if ( likely(__on_net_schedule_list(netif)) )
+    {
+        list_del(&netif->list);
+        netif->list.next = NULL;
+        netif_put(netif);
+    }
+    spin_unlock_irq(&net_schedule_list_lock);
+}
+
+static void add_to_net_schedule_list_tail(netif_t *netif)
+{
+    if ( __on_net_schedule_list(netif) )
+        return;
+
+    spin_lock_irq(&net_schedule_list_lock);
+    if ( !__on_net_schedule_list(netif) && (netif->status == CONNECTED) )
+    {
+        list_add_tail(&netif->list, &net_schedule_list);
+        netif_get(netif);
+    }
+    spin_unlock_irq(&net_schedule_list_lock);
+}
+
+static inline void netif_schedule_work(netif_t *netif)
+{
+    if ( (netif->tx_req_cons != netif->tx->req_prod) &&
+         ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) )
+    {
+        add_to_net_schedule_list_tail(netif);
+        maybe_schedule_tx_action();
+    }
+}
+
+void netif_deschedule(netif_t *netif)
+{
+    remove_from_net_schedule_list(netif);
+}
+
+#if 0
+static void tx_credit_callback(unsigned long data)
+{
+    netif_t *netif = (netif_t *)data;
+    netif->remaining_credit = netif->credit_bytes;
+    netif_schedule_work(netif);
+}
+#endif
+
+static void net_tx_action(unsigned long unused)
+{
+    struct list_head *ent;
+    struct sk_buff *skb;
+    netif_t *netif;
+    netif_tx_request_t txreq;
+    u16 pending_idx;
+    NETIF_RING_IDX i;
+    struct page *page;
+    multicall_entry_t *mcl;
+    PEND_RING_IDX dc, dp;
+
+    if ( (dc = dealloc_cons) == (dp = dealloc_prod) )
+        goto skip_dealloc;
+
+    mcl = tx_mcl;
+    while ( dc != dp )
+    {
+        pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)];
+        mcl[0].op = __HYPERVISOR_update_va_mapping;
+        mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT;
+        mcl[0].args[1] = 0;
+        mcl[0].args[2] = 0;
+        mcl++;        
+    }
+
+    mcl[-1].args[2] = UVMF_FLUSH_TLB;
+    if ( unlikely(HYPERVISOR_multicall(tx_mcl, mcl - tx_mcl) != 0) )
+        BUG();
+
+    mcl = tx_mcl;
+    while ( dealloc_cons != dp )
+    {
+        /* The update_va_mapping() must not fail. */
+        if ( unlikely(mcl[0].args[5] != 0) )
+            BUG();
+
+        pending_idx = dealloc_ring[MASK_PEND_IDX(dealloc_cons++)];
+
+        netif = pending_tx_info[pending_idx].netif;
+
+        spin_lock(&netif->tx_lock);
+        make_tx_response(netif, pending_tx_info[pending_idx].req.id, 
+                         NETIF_RSP_OKAY);
+        spin_unlock(&netif->tx_lock);
+        
+        pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
+        
+        /*
+         * Scheduling checks must happen after the above response is posted.
+         * This avoids a possible race with a guest OS on another CPU.
+         */
+        mb();
+        if ( (netif->tx_req_cons != netif->tx->req_prod) &&
+             ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) )
+            add_to_net_schedule_list_tail(netif);
+        
+        netif_put(netif);
+
+        mcl++;
+    }
+
+ skip_dealloc:
+    mcl = tx_mcl;
+    while ( (NR_PENDING_REQS < MAX_PENDING_REQS) &&
+            !list_empty(&net_schedule_list) )
+    {
+        /* Get a netif from the list with work to do. */
+        ent = net_schedule_list.next;
+        netif = list_entry(ent, netif_t, list);
+        netif_get(netif);
+        remove_from_net_schedule_list(netif);
+
+        /* Work to do? */
+        i = netif->tx_req_cons;
+        if ( (i == netif->tx->req_prod) ||
+             ((i-netif->tx_resp_prod) == NETIF_TX_RING_SIZE) )
+        {
+            netif_put(netif);
+            continue;
+        }
+        memcpy(&txreq, &netif->tx->ring[MASK_NETIF_TX_IDX(i)].req, 
+               sizeof(txreq));
+        netif->tx_req_cons++;
+
+#if 0
+        /* Credit-based scheduling. */
+        if ( tx.size > netif->remaining_credit )
+        {
+            s_time_t now = NOW(), next_credit = 
+                netif->credit_timeout.expires + MICROSECS(netif->credit_usec);
+            if ( next_credit <= now )
+            {
+                netif->credit_timeout.expires = now;
+                netif->remaining_credit = netif->credit_bytes;
+            }
+            else
+            {
+                netif->remaining_credit = 0;
+                netif->credit_timeout.expires  = next_credit;
+                netif->credit_timeout.data     = (unsigned long)netif;
+                netif->credit_timeout.function = tx_credit_callback;
+                netif->credit_timeout.cpu      = smp_processor_id();
+                add_ac_timer(&netif->credit_timeout);
+                break;
+            }
+        }
+        netif->remaining_credit -= tx.size;
+#endif
+
+        netif_schedule_work(netif);
+
+        if ( unlikely(txreq.size <= PKT_PROT_LEN) || 
+             unlikely(txreq.size > ETH_FRAME_LEN) )
+        {
+            DPRINTK("Bad packet size: %d\n", txreq.size);
+            make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
+            netif_put(netif);
+            continue; 
+        }
+
+        /* No crossing a page boundary as the payload mustn't fragment. */
+        if ( unlikely(((txreq.addr & ~PAGE_MASK) + txreq.size) >= PAGE_SIZE) ) 
+        {
+            DPRINTK("txreq.addr: %lx, size: %u, end: %lu\n", 
+                    txreq.addr, txreq.size, 
+                    (txreq.addr &~PAGE_MASK) + txreq.size);
+            make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
+            netif_put(netif);
+            continue;
+        }
+
+        pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
+
+        if ( unlikely((skb = alloc_skb(PKT_PROT_LEN+16, GFP_ATOMIC)) == NULL) )
+        {
+            DPRINTK("Can't allocate a skb in start_xmit.\n");
+            make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
+            netif_put(netif);
+            break;
+        }
+
+        /* Packets passed to netif_rx() must have some headroom. */
+        skb_reserve(skb, 16);
+
+        mcl[0].op = __HYPERVISOR_update_va_mapping_otherdomain;
+        mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT;
+        mcl[0].args[1] = (txreq.addr & PAGE_MASK) | __PAGE_KERNEL;
+        mcl[0].args[2] = 0;
+        mcl[0].args[3] = netif->domid;
+        mcl++;
+
+        memcpy(&pending_tx_info[pending_idx].req, &txreq, sizeof(txreq));
+        pending_tx_info[pending_idx].netif = netif;
+        *((u16 *)skb->data) = pending_idx;
+
+        __skb_queue_tail(&tx_queue, skb);
+
+        pending_cons++;
+
+        /* Filled the batch queue? */
+        if ( (mcl - tx_mcl) == ARRAY_SIZE(tx_mcl) )
+            break;
+    }
+
+    if ( mcl == tx_mcl )
+        return;
+
+    if ( unlikely(HYPERVISOR_multicall(tx_mcl, mcl - tx_mcl) != 0) )
+        BUG();
+
+    mcl = tx_mcl;
+    while ( (skb = __skb_dequeue(&tx_queue)) != NULL )
+    {
+        pending_idx = *((u16 *)skb->data);
+        netif       = pending_tx_info[pending_idx].netif;
+        memcpy(&txreq, &pending_tx_info[pending_idx].req, sizeof(txreq));
+
+        /* Check the remap error code. */
+        if ( unlikely(mcl[0].args[5] != 0) )
+        {
+            DPRINTK("Bad page frame\n");
+            make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
+            netif_put(netif);
+            kfree_skb(skb);
+            mcl++;
+            pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
+            continue;
+        }
+
+        phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT] =
+            txreq.addr >> PAGE_SHIFT;
+
+        __skb_put(skb, PKT_PROT_LEN);
+        memcpy(skb->data, 
+               (void *)(MMAP_VADDR(pending_idx)|(txreq.addr&~PAGE_MASK)),
+               PKT_PROT_LEN);
+
+        page = virt_to_page(MMAP_VADDR(pending_idx));
+
+        /* Append the packet payload as a fragment. */
+        skb_shinfo(skb)->frags[0].page        = page;
+        skb_shinfo(skb)->frags[0].size        = txreq.size - PKT_PROT_LEN;
+        skb_shinfo(skb)->frags[0].page_offset = 
+            (txreq.addr + PKT_PROT_LEN) & ~PAGE_MASK;
+        skb_shinfo(skb)->nr_frags = 1;
+        skb->data_len  = txreq.size - PKT_PROT_LEN;
+        skb->len      += skb->data_len;
+
+        skb->dev      = netif->dev;
+        skb->protocol = eth_type_trans(skb, skb->dev);
+
+        /*
+         * Destructor information. We hideously abuse the 'mapping' pointer,
+         * which isn't otherwise used by us. The page deallocator is modified
+         * to interpret a non-NULL value as a destructor function to be called.
+         * This works okay because in all other cases the pointer must be NULL
+         * when the page is freed (normally Linux will explicitly bug out if
+         * it sees otherwise.
+         */
+        page->mapping = (struct address_space *)netif_page_release;
+        set_page_count(page, 1);
+
+        netif->stats.tx_bytes += txreq.size;
+        netif->stats.tx_packets++;
+
+        netif_rx(skb);
+        netif->dev->last_rx = jiffies;
+
+        mcl++;
+    }
+}
+
+static void netif_page_release(struct page *page)
+{
+    unsigned long flags;
+    u16 pending_idx = page - virt_to_page(mmap_vstart);
+
+    /* Stop the abuse. */
+    page->mapping = NULL;
+
+    spin_lock_irqsave(&dealloc_lock, flags);
+    dealloc_ring[MASK_PEND_IDX(dealloc_prod++)] = pending_idx;
+    spin_unlock_irqrestore(&dealloc_lock, flags);
+
+    tasklet_schedule(&net_tx_tasklet);
+}
+
+#if 0
+long flush_bufs_for_netif(netif_t *netif)
+{
+    NETIF_RING_IDX i;
+
+    /* Return any outstanding receive buffers to the guest OS. */
+    spin_lock(&netif->rx_lock);
+    for ( i = netif->rx_req_cons; 
+          (i != netif->rx->req_prod) &&
+              ((i-netif->rx_resp_prod) != NETIF_RX_RING_SIZE);
+          i++ )
+    {
+        make_rx_response(netif,
+                         netif->rx->ring[MASK_NETIF_RX_IDX(i)].req.id,
+                         NETIF_RSP_DROPPED, 0, 0);
+    }
+    netif->rx_req_cons = i;
+    spin_unlock(&netif->rx_lock);
+
+    /*
+     * Flush pending transmit buffers. The guest may still have to wait for
+     * buffers that are queued at a physical NIC.
+     */
+    spin_lock(&netif->tx_lock);
+    for ( i = netif->tx_req_cons; 
+          (i != netif->tx->req_prod) &&
+              ((i-netif->tx_resp_prod) != NETIF_TX_RING_SIZE);
+          i++ )
+    {
+        make_tx_response(netif,
+                         netif->tx->ring[MASK_NETIF_TX_IDX(i)].req.id,
+                         NETIF_RSP_DROPPED);
+    }
+    netif->tx_req_cons = i;
+    spin_unlock(&netif->tx_lock);
+
+    return 0;
+}
+#endif
+
+irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs)
+{
+    netif_t *netif = dev_id;
+    if ( tx_work_exists(netif) )
+    {
+        add_to_net_schedule_list_tail(netif);
+        maybe_schedule_tx_action();
+    }
+    return IRQ_HANDLED;
+}
+
+static void make_tx_response(netif_t *netif, 
+                             u16      id,
+                             s8       st)
+{
+    NETIF_RING_IDX i = netif->tx_resp_prod;
+    netif_tx_response_t *resp;
+
+    resp = &netif->tx->ring[MASK_NETIF_TX_IDX(i)].resp;
+    resp->id     = id;
+    resp->status = st;
+    wmb();
+    netif->tx->resp_prod = netif->tx_resp_prod = ++i;
+
+    mb(); /* Update producer before checking event threshold. */
+    if ( i == netif->tx->event )
+        notify_via_evtchn(netif->evtchn);
+}
+
+static int make_rx_response(netif_t *netif, 
+                            u16      id, 
+                            s8       st,
+                            memory_t addr,
+                            u16      size)
+{
+    NETIF_RING_IDX i = netif->rx_resp_prod;
+    netif_rx_response_t *resp;
+
+    resp = &netif->rx->ring[MASK_NETIF_RX_IDX(i)].resp;
+    resp->addr   = addr;
+    resp->id     = id;
+    resp->status = (s16)size;
+    if ( st < 0 )
+        resp->status = (s16)st;
+    wmb();
+    netif->rx->resp_prod = netif->rx_resp_prod = ++i;
+
+    mb(); /* Update producer before checking event threshold. */
+    return (i == netif->rx->event);
+}
+
+static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
+{
+    struct list_head *ent;
+    netif_t *netif;
+    int i = 0;
+
+    printk(KERN_ALERT "netif_schedule_list:\n");
+    spin_lock_irq(&net_schedule_list_lock);
+
+    list_for_each ( ent, &net_schedule_list )
+    {
+        netif = list_entry(ent, netif_t, list);
+        printk(KERN_ALERT " %d: private(rx_req_cons=%08x rx_resp_prod=%08x\n",
+               i, netif->rx_req_cons, netif->rx_resp_prod);               
+        printk(KERN_ALERT "   tx_req_cons=%08x tx_resp_prod=%08x)\n",
+               netif->tx_req_cons, netif->tx_resp_prod);
+        printk(KERN_ALERT "   shared(rx_req_prod=%08x rx_resp_prod=%08x\n",
+               netif->rx->req_prod, netif->rx->resp_prod);
+        printk(KERN_ALERT "   rx_event=%08x tx_req_prod=%08x\n",
+               netif->rx->event, netif->tx->req_prod);
+        printk(KERN_ALERT "   tx_resp_prod=%08x, tx_event=%08x)\n",
+               netif->tx->resp_prod, netif->tx->event);
+        i++;
+    }
+
+    spin_unlock_irq(&net_schedule_list_lock);
+    printk(KERN_ALERT " ** End of netif_schedule_list **\n");
+
+    return IRQ_HANDLED;
+}
+
+static int __init netback_init(void)
+{
+    int i;
+
+    if ( !(start_info.flags & SIF_NET_BE_DOMAIN) &&
+	 !(start_info.flags & SIF_INITDOMAIN) )
+        return 0;
+
+    printk("Initialising Xen netif backend\n");
+
+    skb_queue_head_init(&rx_queue);
+    skb_queue_head_init(&tx_queue);
+
+    netif_interface_init();
+
+    if ( (mmap_vstart = allocate_empty_lowmem_region(MAX_PENDING_REQS)) == 0 )
+        BUG();
+
+    pending_cons = 0;
+    pending_prod = MAX_PENDING_REQS;
+    for ( i = 0; i < MAX_PENDING_REQS; i++ )
+        pending_ring[i] = i;
+
+    spin_lock_init(&net_schedule_list_lock);
+    INIT_LIST_HEAD(&net_schedule_list);
+
+    netif_ctrlif_init();
+
+    (void)request_irq(bind_virq_to_irq(VIRQ_DEBUG),
+                      netif_be_dbg, SA_SHIRQ, 
+                      "net-be-dbg", &netif_be_dbg);
+
+    return 0;
+}
+
+static void netback_cleanup(void)
+{
+    BUG();
+}
+
+module_init(netback_init);
+module_exit(netback_cleanup);
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/net/Kconfig b/linux-2.6.7-xen-sparse/drivers/xen/netfront/Kconfig
similarity index 100%
rename from linux-2.6.7-xen-sparse/drivers/xen/net/Kconfig
rename to linux-2.6.7-xen-sparse/drivers/xen/netfront/Kconfig
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/netfront/Makefile b/linux-2.6.7-xen-sparse/drivers/xen/netfront/Makefile
new file mode 100644
index 0000000000..7eb07a2483
--- /dev/null
+++ b/linux-2.6.7-xen-sparse/drivers/xen/netfront/Makefile
@@ -0,0 +1,2 @@
+
+obj-y	:= netfront.o
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/net/network.c b/linux-2.6.7-xen-sparse/drivers/xen/netfront/netfront.c
similarity index 100%
rename from linux-2.6.7-xen-sparse/drivers/xen/net/network.c
rename to linux-2.6.7-xen-sparse/drivers/xen/netfront/netfront.c
diff --git a/linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/hypervisor.h b/linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/hypervisor.h
index ff65b7eb41..49bf0975ec 100644
--- a/linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/hypervisor.h
+++ b/linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/hypervisor.h
@@ -151,6 +151,12 @@ static inline int flush_page_update_queue(void)
 #define xen_flush_page_update_queue() (_flush_page_update_queue())
 void MULTICALL_flush_page_update_queue(void);
 
+#ifdef CONFIG_XEN_PHYSDEV_ACCESS
+/* Allocate a contiguous empty region of low memory. Return virtual start. */
+unsigned long allocate_empty_lowmem_region(unsigned long pages);
+/* Deallocate a contiguous region of low memory. Return it to the allocator. */
+void deallocate_lowmem_region(unsigned long vstart, unsigned long pages);
+#endif
 
 /*
  * Assembler stubs for hyper-calls.
@@ -389,9 +395,12 @@ static inline int HYPERVISOR_update_va_mapping(
         "b" (page_nr), "c" ((new_val).pte_low), "d" (flags) : "memory" );
 
     if ( unlikely(ret < 0) )
-        panic("Failed update VA mapping: %08lx, %08lx, %08lx",
-              page_nr, (new_val).pte_low, flags);
-    
+    {
+        printk(KERN_ALERT "Failed update VA mapping: %08lx, %08lx, %08lx\n",
+               page_nr, (new_val).pte_low, flags);
+        BUG();
+    }
+
     return ret;
 }
 
diff --git a/linux-2.6.7-xen-sparse/mm/page_alloc.c b/linux-2.6.7-xen-sparse/mm/page_alloc.c
new file mode 100644
index 0000000000..5d9b765d39
--- /dev/null
+++ b/linux-2.6.7-xen-sparse/mm/page_alloc.c
@@ -0,0 +1,2017 @@
+/*
+ *  linux/mm/page_alloc.c
+ *
+ *  Manages the free list, the system allocates free pages here.
+ *  Note that kmalloc() lives in slab.c
+ *
+ *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *  Swap reorganised 29.12.95, Stephen Tweedie
+ *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
+ *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
+ *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
+ *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
+ *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
+ */
+
+#include <linux/config.h>
+#include <linux/stddef.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/interrupt.h>
+#include <linux/pagemap.h>
+#include <linux/bootmem.h>
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/suspend.h>
+#include <linux/pagevec.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/notifier.h>
+#include <linux/topology.h>
+#include <linux/sysctl.h>
+#include <linux/cpu.h>
+
+#include <asm/tlbflush.h>
+
+DECLARE_BITMAP(node_online_map, MAX_NUMNODES);
+struct pglist_data *pgdat_list;
+unsigned long totalram_pages;
+unsigned long totalhigh_pages;
+int nr_swap_pages;
+int numnodes = 1;
+int sysctl_lower_zone_protection = 0;
+
+EXPORT_SYMBOL(totalram_pages);
+EXPORT_SYMBOL(nr_swap_pages);
+
+/*
+ * Used by page_zone() to look up the address of the struct zone whose
+ * id is encoded in the upper bits of page->flags
+ */
+struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
+EXPORT_SYMBOL(zone_table);
+
+static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
+int min_free_kbytes = 1024;
+
+/*
+ * Temporary debugging check for pages not lying within a given zone.
+ */
+static int bad_range(struct zone *zone, struct page *page)
+{
+	if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
+		return 1;
+	if (page_to_pfn(page) < zone->zone_start_pfn)
+		return 1;
+	if (zone != page_zone(page))
+		return 1;
+	return 0;
+}
+
+static void bad_page(const char *function, struct page *page)
+{
+	printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
+		function, current->comm, page);
+	printk(KERN_EMERG "flags:0x%08lx mapping:%p mapcount:%d count:%d\n",
+		(unsigned long)page->flags, page->mapping,
+		(int)page->mapcount, page_count(page));
+	printk(KERN_EMERG "Backtrace:\n");
+	dump_stack();
+	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
+	page->flags &= ~(1 << PG_private	|
+			1 << PG_locked	|
+			1 << PG_lru	|
+			1 << PG_active	|
+			1 << PG_dirty	|
+			1 << PG_maplock |
+			1 << PG_anon    |
+			1 << PG_swapcache |
+			1 << PG_writeback);
+	set_page_count(page, 0);
+	page->mapping = NULL;
+	page->mapcount = 0;
+}
+
+#ifndef CONFIG_HUGETLB_PAGE
+#define prep_compound_page(page, order) do { } while (0)
+#define destroy_compound_page(page, order) do { } while (0)
+#else
+/*
+ * Higher-order pages are called "compound pages".  They are structured thusly:
+ *
+ * The first PAGE_SIZE page is called the "head page".
+ *
+ * The remaining PAGE_SIZE pages are called "tail pages".
+ *
+ * All pages have PG_compound set.  All pages have their ->private pointing at
+ * the head page (even the head page has this).
+ *
+ * The first tail page's ->mapping, if non-zero, holds the address of the
+ * compound page's put_page() function.
+ *
+ * The order of the allocation is stored in the first tail page's ->index
+ * This is only for debug at present.  This usage means that zero-order pages
+ * may not be compound.
+ */
+static void prep_compound_page(struct page *page, unsigned long order)
+{
+	int i;
+	int nr_pages = 1 << order;
+
+	page[1].mapping = 0;
+	page[1].index = order;
+	for (i = 0; i < nr_pages; i++) {
+		struct page *p = page + i;
+
+		SetPageCompound(p);
+		p->private = (unsigned long)page;
+	}
+}
+
+static void destroy_compound_page(struct page *page, unsigned long order)
+{
+	int i;
+	int nr_pages = 1 << order;
+
+	if (!PageCompound(page))
+		return;
+
+	if (page[1].index != order)
+		bad_page(__FUNCTION__, page);
+
+	for (i = 0; i < nr_pages; i++) {
+		struct page *p = page + i;
+
+		if (!PageCompound(p))
+			bad_page(__FUNCTION__, page);
+		if (p->private != (unsigned long)page)
+			bad_page(__FUNCTION__, page);
+		ClearPageCompound(p);
+	}
+}
+#endif		/* CONFIG_HUGETLB_PAGE */
+
+/*
+ * Freeing function for a buddy system allocator.
+ *
+ * The concept of a buddy system is to maintain direct-mapped table
+ * (containing bit values) for memory blocks of various "orders".
+ * The bottom level table contains the map for the smallest allocatable
+ * units of memory (here, pages), and each level above it describes
+ * pairs of units from the levels below, hence, "buddies".
+ * At a high level, all that happens here is marking the table entry
+ * at the bottom level available, and propagating the changes upward
+ * as necessary, plus some accounting needed to play nicely with other
+ * parts of the VM system.
+ * At each level, we keep one bit for each pair of blocks, which
+ * is set to 1 iff only one of the pair is allocated.  So when we
+ * are allocating or freeing one, we can derive the state of the
+ * other.  That is, if we allocate a small block, and both were   
+ * free, the remainder of the region must be split into blocks.   
+ * If a block is freed, and its buddy is also free, then this
+ * triggers coalescing into a block of larger size.            
+ *
+ * -- wli
+ */
+
+static inline void __free_pages_bulk (struct page *page, struct page *base,
+		struct zone *zone, struct free_area *area, unsigned long mask,
+		unsigned int order)
+{
+	unsigned long page_idx, index;
+
+	if (order)
+		destroy_compound_page(page, order);
+	page_idx = page - base;
+	if (page_idx & ~mask)
+		BUG();
+	index = page_idx >> (1 + order);
+
+	zone->free_pages -= mask;
+	while (mask + (1 << (MAX_ORDER-1))) {
+		struct page *buddy1, *buddy2;
+
+		BUG_ON(area >= zone->free_area + MAX_ORDER);
+		if (!__test_and_change_bit(index, area->map))
+			/*
+			 * the buddy page is still allocated.
+			 */
+			break;
+		/*
+		 * Move the buddy up one level.
+		 * This code is taking advantage of the identity:
+		 * 	-mask = 1+~mask
+		 */
+		buddy1 = base + (page_idx ^ -mask);
+		buddy2 = base + page_idx;
+		BUG_ON(bad_range(zone, buddy1));
+		BUG_ON(bad_range(zone, buddy2));
+		list_del(&buddy1->lru);
+		mask <<= 1;
+		area++;
+		index >>= 1;
+		page_idx &= mask;
+	}
+	list_add(&(base + page_idx)->lru, &area->free_list);
+}
+
+static inline void free_pages_check(const char *function, struct page *page)
+{
+	if (	page_mapped(page) ||
+		page->mapping != NULL ||
+		page_count(page) != 0 ||
+		(page->flags & (
+			1 << PG_lru	|
+			1 << PG_private |
+			1 << PG_locked	|
+			1 << PG_active	|
+			1 << PG_reclaim	|
+			1 << PG_slab	|
+			1 << PG_maplock |
+			1 << PG_anon    |
+			1 << PG_swapcache |
+			1 << PG_writeback )))
+		bad_page(function, page);
+	if (PageDirty(page))
+		ClearPageDirty(page);
+}
+
+/*
+ * Frees a list of pages. 
+ * Assumes all pages on list are in same zone, and of same order.
+ * count is the number of pages to free, or 0 for all on the list.
+ *
+ * If the zone was previously in an "all pages pinned" state then look to
+ * see if this freeing clears that state.
+ *
+ * And clear the zone's pages_scanned counter, to hold off the "all pages are
+ * pinned" detection logic.
+ */
+static int
+free_pages_bulk(struct zone *zone, int count,
+		struct list_head *list, unsigned int order)
+{
+	unsigned long mask, flags;
+	struct free_area *area;
+	struct page *base, *page = NULL;
+	int ret = 0;
+
+	mask = (~0UL) << order;
+	base = zone->zone_mem_map;
+	area = zone->free_area + order;
+	spin_lock_irqsave(&zone->lock, flags);
+	zone->all_unreclaimable = 0;
+	zone->pages_scanned = 0;
+	while (!list_empty(list) && count--) {
+		page = list_entry(list->prev, struct page, lru);
+		/* have to delete it as __free_pages_bulk list manipulates */
+		list_del(&page->lru);
+		__free_pages_bulk(page, base, zone, area, mask, order);
+		ret++;
+	}
+	spin_unlock_irqrestore(&zone->lock, flags);
+	return ret;
+}
+
+void __free_pages_ok(struct page *page, unsigned int order)
+{
+	LIST_HEAD(list);
+	int i;
+
+	mod_page_state(pgfree, 1 << order);
+	for (i = 0 ; i < (1 << order) ; ++i)
+		free_pages_check(__FUNCTION__, page + i);
+	list_add(&page->lru, &list);
+	kernel_map_pages(page, 1<<order, 0);
+	free_pages_bulk(page_zone(page), 1, &list, order);
+}
+
+#define MARK_USED(index, order, area) \
+	__change_bit((index) >> (1+(order)), (area)->map)
+
+static inline struct page *
+expand(struct zone *zone, struct page *page,
+	 unsigned long index, int low, int high, struct free_area *area)
+{
+	unsigned long size = 1 << high;
+
+	while (high > low) {
+		BUG_ON(bad_range(zone, page));
+		area--;
+		high--;
+		size >>= 1;
+		list_add(&page->lru, &area->free_list);
+		MARK_USED(index, high, area);
+		index += size;
+		page += size;
+	}
+	return page;
+}
+
+static inline void set_page_refs(struct page *page, int order)
+{
+#ifdef CONFIG_MMU
+	set_page_count(page, 1);
+#else
+	int i;
+
+	/*
+	 * We need to reference all the pages for this order, otherwise if
+	 * anyone accesses one of the pages with (get/put) it will be freed.
+	 */
+	for (i = 0; i < (1 << order); i++)
+		set_page_count(page+i, 1);
+#endif /* CONFIG_MMU */
+}
+
+/*
+ * This page is about to be returned from the page allocator
+ */
+static void prep_new_page(struct page *page, int order)
+{
+	if (page->mapping || page_mapped(page) ||
+	    (page->flags & (
+			1 << PG_private	|
+			1 << PG_locked	|
+			1 << PG_lru	|
+			1 << PG_active	|
+			1 << PG_dirty	|
+			1 << PG_reclaim	|
+			1 << PG_maplock |
+			1 << PG_anon    |
+			1 << PG_swapcache |
+			1 << PG_writeback )))
+		bad_page(__FUNCTION__, page);
+
+	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
+			1 << PG_referenced | 1 << PG_arch_1 |
+			1 << PG_checked | 1 << PG_mappedtodisk);
+	page->private = 0;
+	set_page_refs(page, order);
+}
+
+/* 
+ * Do the hard work of removing an element from the buddy allocator.
+ * Call me with the zone->lock already held.
+ */
+static struct page *__rmqueue(struct zone *zone, unsigned int order)
+{
+	struct free_area * area;
+	unsigned int current_order;
+	struct page *page;
+	unsigned int index;
+
+	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
+		area = zone->free_area + current_order;
+		if (list_empty(&area->free_list))
+			continue;
+
+		page = list_entry(area->free_list.next, struct page, lru);
+		list_del(&page->lru);
+		index = page - zone->zone_mem_map;
+		if (current_order != MAX_ORDER-1)
+			MARK_USED(index, current_order, area);
+		zone->free_pages -= 1UL << order;
+		return expand(zone, page, index, order, current_order, area);
+	}
+
+	return NULL;
+}
+
+/* 
+ * Obtain a specified number of elements from the buddy allocator, all under
+ * a single hold of the lock, for efficiency.  Add them to the supplied list.
+ * Returns the number of new pages which were placed at *list.
+ */
+static int rmqueue_bulk(struct zone *zone, unsigned int order, 
+			unsigned long count, struct list_head *list)
+{
+	unsigned long flags;
+	int i;
+	int allocated = 0;
+	struct page *page;
+	
+	spin_lock_irqsave(&zone->lock, flags);
+	for (i = 0; i < count; ++i) {
+		page = __rmqueue(zone, order);
+		if (page == NULL)
+			break;
+		allocated++;
+		list_add_tail(&page->lru, list);
+	}
+	spin_unlock_irqrestore(&zone->lock, flags);
+	return allocated;
+}
+
+#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
+static void __drain_pages(unsigned int cpu)
+{
+	struct zone *zone;
+	int i;
+
+	for_each_zone(zone) {
+		struct per_cpu_pageset *pset;
+
+		pset = &zone->pageset[cpu];
+		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
+			struct per_cpu_pages *pcp;
+
+			pcp = &pset->pcp[i];
+			pcp->count -= free_pages_bulk(zone, pcp->count,
+						&pcp->list, 0);
+		}
+	}
+}
+#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
+
+#ifdef CONFIG_PM
+int is_head_of_free_region(struct page *page)
+{
+        struct zone *zone = page_zone(page);
+        unsigned long flags;
+	int order;
+	struct list_head *curr;
+
+	/*
+	 * Should not matter as we need quiescent system for
+	 * suspend anyway, but...
+	 */
+	spin_lock_irqsave(&zone->lock, flags);
+	for (order = MAX_ORDER - 1; order >= 0; --order)
+		list_for_each(curr, &zone->free_area[order].free_list)
+			if (page == list_entry(curr, struct page, lru)) {
+				spin_unlock_irqrestore(&zone->lock, flags);
+				return 1 << order;
+			}
+	spin_unlock_irqrestore(&zone->lock, flags);
+        return 0;
+}
+
+/*
+ * Spill all of this CPU's per-cpu pages back into the buddy allocator.
+ */
+void drain_local_pages(void)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);	
+	__drain_pages(smp_processor_id());
+	local_irq_restore(flags);	
+}
+#endif /* CONFIG_PM */
+
+static void zone_statistics(struct zonelist *zonelist, struct zone *z)
+{
+#ifdef CONFIG_NUMA
+	unsigned long flags;
+	int cpu;
+	pg_data_t *pg = z->zone_pgdat;
+	pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
+	struct per_cpu_pageset *p;
+
+	local_irq_save(flags);
+	cpu = smp_processor_id();
+	p = &z->pageset[cpu];
+	if (pg == orig) {
+		z->pageset[cpu].numa_hit++;
+	} else {
+		p->numa_miss++;
+		zonelist->zones[0]->pageset[cpu].numa_foreign++;
+	}
+	if (pg == NODE_DATA(numa_node_id()))
+		p->local_node++;
+	else
+		p->other_node++;
+	local_irq_restore(flags);
+#endif
+}
+
+/*
+ * Free a 0-order page
+ */
+static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
+static void fastcall free_hot_cold_page(struct page *page, int cold)
+{
+	struct zone *zone = page_zone(page);
+	struct per_cpu_pages *pcp;
+	unsigned long flags;
+
+	/* XXX Xen: use mapping pointer as skb/data-page destructor */
+	if (page->mapping)
+		return (*(void(*)(struct page *))page->mapping)(page);
+
+	kernel_map_pages(page, 1, 0);
+	inc_page_state(pgfree);
+	free_pages_check(__FUNCTION__, page);
+	pcp = &zone->pageset[get_cpu()].pcp[cold];
+	local_irq_save(flags);
+	if (pcp->count >= pcp->high)
+		pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
+	list_add(&page->lru, &pcp->list);
+	pcp->count++;
+	local_irq_restore(flags);
+	put_cpu();
+}
+
+void fastcall free_hot_page(struct page *page)
+{
+	free_hot_cold_page(page, 0);
+}
+	
+void fastcall free_cold_page(struct page *page)
+{
+	free_hot_cold_page(page, 1);
+}
+
+/*
+ * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
+ * we cheat by calling it from here, in the order > 0 path.  Saves a branch
+ * or two.
+ */
+
+static struct page *
+buffered_rmqueue(struct zone *zone, int order, int gfp_flags)
+{
+	unsigned long flags;
+	struct page *page = NULL;
+	int cold = !!(gfp_flags & __GFP_COLD);
+
+	if (order == 0) {
+		struct per_cpu_pages *pcp;
+
+		pcp = &zone->pageset[get_cpu()].pcp[cold];
+		local_irq_save(flags);
+		if (pcp->count <= pcp->low)
+			pcp->count += rmqueue_bulk(zone, 0,
+						pcp->batch, &pcp->list);
+		if (pcp->count) {
+			page = list_entry(pcp->list.next, struct page, lru);
+			list_del(&page->lru);
+			pcp->count--;
+		}
+		local_irq_restore(flags);
+		put_cpu();
+	}
+
+	if (page == NULL) {
+		spin_lock_irqsave(&zone->lock, flags);
+		page = __rmqueue(zone, order);
+		spin_unlock_irqrestore(&zone->lock, flags);
+	}
+
+	if (page != NULL) {
+		BUG_ON(bad_range(zone, page));
+		mod_page_state_zone(zone, pgalloc, 1 << order);
+		prep_new_page(page, order);
+		if (order && (gfp_flags & __GFP_COMP))
+			prep_compound_page(page, order);
+	}
+	return page;
+}
+
+/*
+ * This is the 'heart' of the zoned buddy allocator.
+ *
+ * Herein lies the mysterious "incremental min".  That's the
+ *
+ *	local_low = z->pages_low;
+ *	min += local_low;
+ *
+ * thing.  The intent here is to provide additional protection to low zones for
+ * allocation requests which _could_ use higher zones.  So a GFP_HIGHMEM
+ * request is not allowed to dip as deeply into the normal zone as a GFP_KERNEL
+ * request.  This preserves additional space in those lower zones for requests
+ * which really do need memory from those zones.  It means that on a decent
+ * sized machine, GFP_HIGHMEM and GFP_KERNEL requests basically leave the DMA
+ * zone untouched.
+ */
+struct page * fastcall
+__alloc_pages(unsigned int gfp_mask, unsigned int order,
+		struct zonelist *zonelist)
+{
+	const int wait = gfp_mask & __GFP_WAIT;
+	unsigned long min;
+	struct zone **zones;
+	struct page *page;
+	struct reclaim_state reclaim_state;
+	struct task_struct *p = current;
+	int i;
+	int alloc_type;
+	int do_retry;
+
+	might_sleep_if(wait);
+
+	zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
+	if (zones[0] == NULL)     /* no zones in the zonelist */
+		return NULL;
+
+	alloc_type = zone_idx(zones[0]);
+
+	/* Go through the zonelist once, looking for a zone with enough free */
+	for (i = 0; zones[i] != NULL; i++) {
+		struct zone *z = zones[i];
+
+		min = (1<<order) + z->protection[alloc_type];
+
+		/*
+		 * We let real-time tasks dip their real-time paws a little
+		 * deeper into reserves.
+		 */
+		if (rt_task(p))
+			min -= z->pages_low >> 1;
+
+		if (z->free_pages >= min ||
+				(!wait && z->free_pages >= z->pages_high)) {
+			page = buffered_rmqueue(z, order, gfp_mask);
+			if (page) {
+				zone_statistics(zonelist, z);
+				goto got_pg;
+			}
+		}
+	}
+
+	/* we're somewhat low on memory, failed to find what we needed */
+	for (i = 0; zones[i] != NULL; i++)
+		wakeup_kswapd(zones[i]);
+
+	/* Go through the zonelist again, taking __GFP_HIGH into account */
+	for (i = 0; zones[i] != NULL; i++) {
+		struct zone *z = zones[i];
+
+		min = (1<<order) + z->protection[alloc_type];
+
+		if (gfp_mask & __GFP_HIGH)
+			min -= z->pages_low >> 2;
+		if (rt_task(p))
+			min -= z->pages_low >> 1;
+
+		if (z->free_pages >= min ||
+				(!wait && z->free_pages >= z->pages_high)) {
+			page = buffered_rmqueue(z, order, gfp_mask);
+			if (page) {
+				zone_statistics(zonelist, z);
+				goto got_pg;
+			}
+		}
+	}
+
+	/* here we're in the low on memory slow path */
+
+rebalance:
+	if ((p->flags & (PF_MEMALLOC | PF_MEMDIE)) && !in_interrupt()) {
+		/* go through the zonelist yet again, ignoring mins */
+		for (i = 0; zones[i] != NULL; i++) {
+			struct zone *z = zones[i];
+
+			page = buffered_rmqueue(z, order, gfp_mask);
+			if (page) {
+				zone_statistics(zonelist, z);
+				goto got_pg;
+			}
+		}
+		goto nopage;
+	}
+
+	/* Atomic allocations - we can't balance anything */
+	if (!wait)
+		goto nopage;
+
+	p->flags |= PF_MEMALLOC;
+	reclaim_state.reclaimed_slab = 0;
+	p->reclaim_state = &reclaim_state;
+
+	try_to_free_pages(zones, gfp_mask, order);
+
+	p->reclaim_state = NULL;
+	p->flags &= ~PF_MEMALLOC;
+
+	/* go through the zonelist yet one more time */
+	for (i = 0; zones[i] != NULL; i++) {
+		struct zone *z = zones[i];
+
+		min = (1UL << order) + z->protection[alloc_type];
+
+		if (z->free_pages >= min ||
+				(!wait && z->free_pages >= z->pages_high)) {
+			page = buffered_rmqueue(z, order, gfp_mask);
+			if (page) {
+ 				zone_statistics(zonelist, z);
+				goto got_pg;
+			}
+		}
+	}
+
+	/*
+	 * Don't let big-order allocations loop unless the caller explicitly
+	 * requests that.  Wait for some write requests to complete then retry.
+	 *
+	 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL, but that
+	 * may not be true in other implementations.
+	 */
+	do_retry = 0;
+	if (!(gfp_mask & __GFP_NORETRY)) {
+		if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
+			do_retry = 1;
+		if (gfp_mask & __GFP_NOFAIL)
+			do_retry = 1;
+	}
+	if (do_retry) {
+		blk_congestion_wait(WRITE, HZ/50);
+		goto rebalance;
+	}
+
+nopage:
+	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
+		printk(KERN_WARNING "%s: page allocation failure."
+			" order:%d, mode:0x%x\n",
+			p->comm, order, gfp_mask);
+		dump_stack();
+	}
+	return NULL;
+got_pg:
+	kernel_map_pages(page, 1 << order, 1);
+	return page;
+}
+
+EXPORT_SYMBOL(__alloc_pages);
+
+#ifdef CONFIG_NUMA
+/* Early boot: Everything is done by one cpu, but the data structures will be
+ * used by all cpus - spread them on all nodes.
+ */
+static __init unsigned long get_boot_pages(unsigned int gfp_mask, unsigned int order)
+{
+static int nodenr;
+	int i = nodenr;
+	struct page *page;
+
+	for (;;) {
+		if (i > nodenr + numnodes)
+			return 0;
+		if (node_present_pages(i%numnodes)) {
+			struct zone **z;
+			/* The node contains memory. Check that there is
+			 * memory in the intended zonelist.
+			 */
+			z = NODE_DATA(i%numnodes)->node_zonelists[gfp_mask & GFP_ZONEMASK].zones;
+			while (*z) {
+				if ( (*z)->free_pages > (1UL<<order))
+					goto found_node;
+				z++;
+			}
+		}
+		i++;
+	}
+found_node:
+	nodenr = i+1;
+	page = alloc_pages_node(i%numnodes, gfp_mask, order);
+	if (!page)
+		return 0;
+	return (unsigned long) page_address(page);
+}
+#endif
+
+/*
+ * Common helper functions.
+ */
+fastcall unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
+{
+	struct page * page;
+
+#ifdef CONFIG_NUMA
+	if (unlikely(system_state == SYSTEM_BOOTING))
+		return get_boot_pages(gfp_mask, order);
+#endif
+	page = alloc_pages(gfp_mask, order);
+	if (!page)
+		return 0;
+	return (unsigned long) page_address(page);
+}
+
+EXPORT_SYMBOL(__get_free_pages);
+
+fastcall unsigned long get_zeroed_page(unsigned int gfp_mask)
+{
+	struct page * page;
+
+	/*
+	 * get_zeroed_page() returns a 32-bit address, which cannot represent
+	 * a highmem page
+	 */
+	BUG_ON(gfp_mask & __GFP_HIGHMEM);
+
+	page = alloc_pages(gfp_mask, 0);
+	if (page) {
+		void *address = page_address(page);
+		clear_page(address);
+		return (unsigned long) address;
+	}
+	return 0;
+}
+
+EXPORT_SYMBOL(get_zeroed_page);
+
+void __pagevec_free(struct pagevec *pvec)
+{
+	int i = pagevec_count(pvec);
+
+	while (--i >= 0)
+		free_hot_cold_page(pvec->pages[i], pvec->cold);
+}
+
+fastcall void __free_pages(struct page *page, unsigned int order)
+{
+	if (!PageReserved(page) && put_page_testzero(page)) {
+		if (order == 0)
+			free_hot_page(page);
+		else
+			__free_pages_ok(page, order);
+	}
+}
+
+EXPORT_SYMBOL(__free_pages);
+
+fastcall void free_pages(unsigned long addr, unsigned int order)
+{
+	if (addr != 0) {
+		BUG_ON(!virt_addr_valid(addr));
+		__free_pages(virt_to_page(addr), order);
+	}
+}
+
+EXPORT_SYMBOL(free_pages);
+
+/*
+ * Total amount of free (allocatable) RAM:
+ */
+unsigned int nr_free_pages(void)
+{
+	unsigned int sum = 0;
+	struct zone *zone;
+
+	for_each_zone(zone)
+		sum += zone->free_pages;
+
+	return sum;
+}
+
+EXPORT_SYMBOL(nr_free_pages);
+
+unsigned int nr_used_zone_pages(void)
+{
+	unsigned int pages = 0;
+	struct zone *zone;
+
+	for_each_zone(zone)
+		pages += zone->nr_active + zone->nr_inactive;
+
+	return pages;
+}
+
+#ifdef CONFIG_NUMA
+unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
+{
+	unsigned int i, sum = 0;
+
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		sum += pgdat->node_zones[i].free_pages;
+
+	return sum;
+}
+#endif
+
+static unsigned int nr_free_zone_pages(int offset)
+{
+	pg_data_t *pgdat;
+	unsigned int sum = 0;
+
+	for_each_pgdat(pgdat) {
+		struct zonelist *zonelist = pgdat->node_zonelists + offset;
+		struct zone **zonep = zonelist->zones;
+		struct zone *zone;
+
+		for (zone = *zonep++; zone; zone = *zonep++) {
+			unsigned long size = zone->present_pages;
+			unsigned long high = zone->pages_high;
+			if (size > high)
+				sum += size - high;
+		}
+	}
+
+	return sum;
+}
+
+/*
+ * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
+ */
+unsigned int nr_free_buffer_pages(void)
+{
+	return nr_free_zone_pages(GFP_USER & GFP_ZONEMASK);
+}
+
+/*
+ * Amount of free RAM allocatable within all zones
+ */
+unsigned int nr_free_pagecache_pages(void)
+{
+	return nr_free_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK);
+}
+
+#ifdef CONFIG_HIGHMEM
+unsigned int nr_free_highpages (void)
+{
+	pg_data_t *pgdat;
+	unsigned int pages = 0;
+
+	for_each_pgdat(pgdat)
+		pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
+
+	return pages;
+}
+#endif
+
+#ifdef CONFIG_NUMA
+static void show_node(struct zone *zone)
+{
+	printk("Node %d ", zone->zone_pgdat->node_id);
+}
+#else
+#define show_node(zone)	do { } while (0)
+#endif
+
+/*
+ * Accumulate the page_state information across all CPUs.
+ * The result is unavoidably approximate - it can change
+ * during and after execution of this function.
+ */
+DEFINE_PER_CPU(struct page_state, page_states) = {0};
+EXPORT_PER_CPU_SYMBOL(page_states);
+
+atomic_t nr_pagecache = ATOMIC_INIT(0);
+EXPORT_SYMBOL(nr_pagecache);
+#ifdef CONFIG_SMP
+DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
+#endif
+
+void __get_page_state(struct page_state *ret, int nr)
+{
+	int cpu = 0;
+
+	memset(ret, 0, sizeof(*ret));
+	while (cpu < NR_CPUS) {
+		unsigned long *in, *out, off;
+
+		if (!cpu_possible(cpu)) {
+			cpu++;
+			continue;
+		}
+
+		in = (unsigned long *)&per_cpu(page_states, cpu);
+		cpu++;
+		if (cpu < NR_CPUS && cpu_possible(cpu))
+			prefetch(&per_cpu(page_states, cpu));
+		out = (unsigned long *)ret;
+		for (off = 0; off < nr; off++)
+			*out++ += *in++;
+	}
+}
+
+void get_page_state(struct page_state *ret)
+{
+	int nr;
+
+	nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
+	nr /= sizeof(unsigned long);
+
+	__get_page_state(ret, nr + 1);
+}
+
+void get_full_page_state(struct page_state *ret)
+{
+	__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long));
+}
+
+unsigned long __read_page_state(unsigned offset)
+{
+	unsigned long ret = 0;
+	int cpu;
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		unsigned long in;
+
+		if (!cpu_possible(cpu))
+			continue;
+
+		in = (unsigned long)&per_cpu(page_states, cpu) + offset;
+		ret += *((unsigned long *)in);
+	}
+	return ret;
+}
+
+void get_zone_counts(unsigned long *active,
+		unsigned long *inactive, unsigned long *free)
+{
+	struct zone *zone;
+
+	*active = 0;
+	*inactive = 0;
+	*free = 0;
+	for_each_zone(zone) {
+		*active += zone->nr_active;
+		*inactive += zone->nr_inactive;
+		*free += zone->free_pages;
+	}
+}
+
+void si_meminfo(struct sysinfo *val)
+{
+	val->totalram = totalram_pages;
+	val->sharedram = 0;
+	val->freeram = nr_free_pages();
+	val->bufferram = nr_blockdev_pages();
+#ifdef CONFIG_HIGHMEM
+	val->totalhigh = totalhigh_pages;
+	val->freehigh = nr_free_highpages();
+#else
+	val->totalhigh = 0;
+	val->freehigh = 0;
+#endif
+	val->mem_unit = PAGE_SIZE;
+}
+
+EXPORT_SYMBOL(si_meminfo);
+
+#ifdef CONFIG_NUMA
+void si_meminfo_node(struct sysinfo *val, int nid)
+{
+	pg_data_t *pgdat = NODE_DATA(nid);
+
+	val->totalram = pgdat->node_present_pages;
+	val->freeram = nr_free_pages_pgdat(pgdat);
+	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
+	val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
+	val->mem_unit = PAGE_SIZE;
+}
+#endif
+
+#define K(x) ((x) << (PAGE_SHIFT-10))
+
+/*
+ * Show free area list (used inside shift_scroll-lock stuff)
+ * We also calculate the percentage fragmentation. We do this by counting the
+ * memory on each free list with the exception of the first item on the list.
+ */
+void show_free_areas(void)
+{
+	struct page_state ps;
+	int cpu, temperature;
+	unsigned long active;
+	unsigned long inactive;
+	unsigned long free;
+	struct zone *zone;
+
+	for_each_zone(zone) {
+		show_node(zone);
+		printk("%s per-cpu:", zone->name);
+
+		if (!zone->present_pages) {
+			printk(" empty\n");
+			continue;
+		} else
+			printk("\n");
+
+		for (cpu = 0; cpu < NR_CPUS; ++cpu) {
+			struct per_cpu_pageset *pageset;
+
+			if (!cpu_possible(cpu))
+				continue;
+
+			pageset = zone->pageset + cpu;
+
+			for (temperature = 0; temperature < 2; temperature++)
+				printk("cpu %d %s: low %d, high %d, batch %d\n",
+					cpu,
+					temperature ? "cold" : "hot",
+					pageset->pcp[temperature].low,
+					pageset->pcp[temperature].high,
+					pageset->pcp[temperature].batch);
+		}
+	}
+
+	get_page_state(&ps);
+	get_zone_counts(&active, &inactive, &free);
+
+	printk("\nFree pages: %11ukB (%ukB HighMem)\n",
+		K(nr_free_pages()),
+		K(nr_free_highpages()));
+
+	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
+		"unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
+		active,
+		inactive,
+		ps.nr_dirty,
+		ps.nr_writeback,
+		ps.nr_unstable,
+		nr_free_pages(),
+		ps.nr_slab,
+		ps.nr_mapped,
+		ps.nr_page_table_pages);
+
+	for_each_zone(zone) {
+		int i;
+
+		show_node(zone);
+		printk("%s"
+			" free:%lukB"
+			" min:%lukB"
+			" low:%lukB"
+			" high:%lukB"
+			" active:%lukB"
+			" inactive:%lukB"
+			" present:%lukB"
+			"\n",
+			zone->name,
+			K(zone->free_pages),
+			K(zone->pages_min),
+			K(zone->pages_low),
+			K(zone->pages_high),
+			K(zone->nr_active),
+			K(zone->nr_inactive),
+			K(zone->present_pages)
+			);
+		printk("protections[]:");
+		for (i = 0; i < MAX_NR_ZONES; i++)
+			printk(" %lu", zone->protection[i]);
+		printk("\n");
+	}
+
+	for_each_zone(zone) {
+		struct list_head *elem;
+ 		unsigned long nr, flags, order, total = 0;
+
+		show_node(zone);
+		printk("%s: ", zone->name);
+		if (!zone->present_pages) {
+			printk("empty\n");
+			continue;
+		}
+
+		spin_lock_irqsave(&zone->lock, flags);
+		for (order = 0; order < MAX_ORDER; order++) {
+			nr = 0;
+			list_for_each(elem, &zone->free_area[order].free_list)
+				++nr;
+			total += nr << order;
+			printk("%lu*%lukB ", nr, K(1UL) << order);
+		}
+		spin_unlock_irqrestore(&zone->lock, flags);
+		printk("= %lukB\n", K(total));
+	}
+
+	show_swap_cache_info();
+}
+
+/*
+ * Builds allocation fallback zone lists.
+ */
+static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
+{
+	switch (k) {
+		struct zone *zone;
+	default:
+		BUG();
+	case ZONE_HIGHMEM:
+		zone = pgdat->node_zones + ZONE_HIGHMEM;
+		if (zone->present_pages) {
+#ifndef CONFIG_HIGHMEM
+			BUG();
+#endif
+			zonelist->zones[j++] = zone;
+		}
+	case ZONE_NORMAL:
+		zone = pgdat->node_zones + ZONE_NORMAL;
+		if (zone->present_pages)
+			zonelist->zones[j++] = zone;
+	case ZONE_DMA:
+		zone = pgdat->node_zones + ZONE_DMA;
+		if (zone->present_pages)
+			zonelist->zones[j++] = zone;
+	}
+
+	return j;
+}
+
+#ifdef CONFIG_NUMA
+#define MAX_NODE_LOAD (numnodes)
+static int __initdata node_load[MAX_NUMNODES];
+/**
+ * find_next_best_node - find the next node that should appear in a given
+ *    node's fallback list
+ * @node: node whose fallback list we're appending
+ * @used_node_mask: pointer to the bitmap of already used nodes
+ *
+ * We use a number of factors to determine which is the next node that should
+ * appear on a given node's fallback list.  The node should not have appeared
+ * already in @node's fallback list, and it should be the next closest node
+ * according to the distance array (which contains arbitrary distance values
+ * from each node to each node in the system), and should also prefer nodes
+ * with no CPUs, since presumably they'll have very little allocation pressure
+ * on them otherwise.
+ * It returns -1 if no node is found.
+ */
+static int __init find_next_best_node(int node, void *used_node_mask)
+{
+	int i, n, val;
+	int min_val = INT_MAX;
+	int best_node = -1;
+
+	for (i = 0; i < numnodes; i++) {
+		cpumask_t tmp;
+
+		/* Start from local node */
+		n = (node+i)%numnodes;
+
+		/* Don't want a node to appear more than once */
+		if (test_bit(n, used_node_mask))
+			continue;
+
+		/* Use the distance array to find the distance */
+		val = node_distance(node, n);
+
+		/* Give preference to headless and unused nodes */
+		tmp = node_to_cpumask(n);
+		if (!cpus_empty(tmp))
+			val += PENALTY_FOR_NODE_WITH_CPUS;
+
+		/* Slight preference for less loaded node */
+		val *= (MAX_NODE_LOAD*MAX_NUMNODES);
+		val += node_load[n];
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	if (best_node >= 0)
+		set_bit(best_node, used_node_mask);
+
+	return best_node;
+}
+
+static void __init build_zonelists(pg_data_t *pgdat)
+{
+	int i, j, k, node, local_node;
+	int prev_node, load;
+	struct zonelist *zonelist;
+	DECLARE_BITMAP(used_mask, MAX_NUMNODES);
+
+	/* initialize zonelists */
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		zonelist = pgdat->node_zonelists + i;
+		memset(zonelist, 0, sizeof(*zonelist));
+		zonelist->zones[0] = NULL;
+	}
+
+	/* NUMA-aware ordering of nodes */
+	local_node = pgdat->node_id;
+	load = numnodes;
+	prev_node = local_node;
+	bitmap_zero(used_mask, MAX_NUMNODES);
+	while ((node = find_next_best_node(local_node, used_mask)) >= 0) {
+		/*
+		 * We don't want to pressure a particular node.
+		 * So adding penalty to the first node in same
+		 * distance group to make it round-robin.
+		 */
+		if (node_distance(local_node, node) !=
+				node_distance(local_node, prev_node))
+			node_load[node] += load;
+		prev_node = node;
+		load--;
+		for (i = 0; i < MAX_NR_ZONES; i++) {
+			zonelist = pgdat->node_zonelists + i;
+			for (j = 0; zonelist->zones[j] != NULL; j++);
+
+			k = ZONE_NORMAL;
+			if (i & __GFP_HIGHMEM)
+				k = ZONE_HIGHMEM;
+			if (i & __GFP_DMA)
+				k = ZONE_DMA;
+
+	 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+			zonelist->zones[j] = NULL;
+		}
+	}
+}
+
+#else	/* CONFIG_NUMA */
+
+static void __init build_zonelists(pg_data_t *pgdat)
+{
+	int i, j, k, node, local_node;
+
+	local_node = pgdat->node_id;
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		struct zonelist *zonelist;
+
+		zonelist = pgdat->node_zonelists + i;
+		memset(zonelist, 0, sizeof(*zonelist));
+
+		j = 0;
+		k = ZONE_NORMAL;
+		if (i & __GFP_HIGHMEM)
+			k = ZONE_HIGHMEM;
+		if (i & __GFP_DMA)
+			k = ZONE_DMA;
+
+ 		j = build_zonelists_node(pgdat, zonelist, j, k);
+ 		/*
+ 		 * Now we build the zonelist so that it contains the zones
+ 		 * of all the other nodes.
+ 		 * We don't want to pressure a particular node, so when
+ 		 * building the zones for node N, we make sure that the
+ 		 * zones coming right after the local ones are those from
+ 		 * node N+1 (modulo N)
+ 		 */
+ 		for (node = local_node + 1; node < numnodes; node++)
+ 			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+ 		for (node = 0; node < local_node; node++)
+ 			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+ 
+		zonelist->zones[j] = NULL;
+	}
+}
+
+#endif	/* CONFIG_NUMA */
+
+void __init build_all_zonelists(void)
+{
+	int i;
+
+	for(i = 0 ; i < numnodes ; i++)
+		build_zonelists(NODE_DATA(i));
+	printk("Built %i zonelists\n", numnodes);
+}
+
+/*
+ * Helper functions to size the waitqueue hash table.
+ * Essentially these want to choose hash table sizes sufficiently
+ * large so that collisions trying to wait on pages are rare.
+ * But in fact, the number of active page waitqueues on typical
+ * systems is ridiculously low, less than 200. So this is even
+ * conservative, even though it seems large.
+ *
+ * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
+ * waitqueues, i.e. the size of the waitq table given the number of pages.
+ */
+#define PAGES_PER_WAITQUEUE	256
+
+static inline unsigned long wait_table_size(unsigned long pages)
+{
+	unsigned long size = 1;
+
+	pages /= PAGES_PER_WAITQUEUE;
+
+	while (size < pages)
+		size <<= 1;
+
+	/*
+	 * Once we have dozens or even hundreds of threads sleeping
+	 * on IO we've got bigger problems than wait queue collision.
+	 * Limit the size of the wait table to a reasonable size.
+	 */
+	size = min(size, 4096UL);
+
+	return max(size, 4UL);
+}
+
+/*
+ * This is an integer logarithm so that shifts can be used later
+ * to extract the more random high bits from the multiplicative
+ * hash function before the remainder is taken.
+ */
+static inline unsigned long wait_table_bits(unsigned long size)
+{
+	return ffz(~size);
+}
+
+#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
+
+static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
+		unsigned long *zones_size, unsigned long *zholes_size)
+{
+	unsigned long realtotalpages, totalpages = 0;
+	int i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		totalpages += zones_size[i];
+	pgdat->node_spanned_pages = totalpages;
+
+	realtotalpages = totalpages;
+	if (zholes_size)
+		for (i = 0; i < MAX_NR_ZONES; i++)
+			realtotalpages -= zholes_size[i];
+	pgdat->node_present_pages = realtotalpages;
+	printk("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
+}
+
+
+/*
+ * Initially all pages are reserved - free ones are freed
+ * up by free_all_bootmem() once the early boot process is
+ * done. Non-atomic initialization, single-pass.
+ */
+void __init memmap_init_zone(struct page *start, unsigned long size, int nid,
+		unsigned long zone, unsigned long start_pfn)
+{
+	struct page *page;
+
+	for (page = start; page < (start + size); page++) {
+		set_page_zone(page, NODEZONE(nid, zone));
+		set_page_count(page, 0);
+		SetPageReserved(page);
+		INIT_LIST_HEAD(&page->lru);
+#ifdef WANT_PAGE_VIRTUAL
+		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
+		if (zone != ZONE_HIGHMEM)
+			set_page_address(page, __va(start_pfn << PAGE_SHIFT));
+#endif
+		start_pfn++;
+	}
+}
+
+#ifndef __HAVE_ARCH_MEMMAP_INIT
+#define memmap_init(start, size, nid, zone, start_pfn) \
+	memmap_init_zone((start), (size), (nid), (zone), (start_pfn))
+#endif
+
+/*
+ * Set up the zone data structures:
+ *   - mark all pages reserved
+ *   - mark all memory queues empty
+ *   - clear the memory bitmaps
+ */
+static void __init free_area_init_core(struct pglist_data *pgdat,
+		unsigned long *zones_size, unsigned long *zholes_size)
+{
+	unsigned long i, j;
+	const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
+	int cpu, nid = pgdat->node_id;
+	struct page *lmem_map = pgdat->node_mem_map;
+	unsigned long zone_start_pfn = pgdat->node_start_pfn;
+
+	pgdat->nr_zones = 0;
+	init_waitqueue_head(&pgdat->kswapd_wait);
+	
+	for (j = 0; j < MAX_NR_ZONES; j++) {
+		struct zone *zone = pgdat->node_zones + j;
+		unsigned long size, realsize;
+		unsigned long batch;
+
+		zone_table[NODEZONE(nid, j)] = zone;
+		realsize = size = zones_size[j];
+		if (zholes_size)
+			realsize -= zholes_size[j];
+
+		zone->spanned_pages = size;
+		zone->present_pages = realsize;
+		zone->name = zone_names[j];
+		spin_lock_init(&zone->lock);
+		spin_lock_init(&zone->lru_lock);
+		zone->zone_pgdat = pgdat;
+		zone->free_pages = 0;
+
+		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
+
+		/*
+		 * The per-cpu-pages pools are set to around 1000th of the
+		 * size of the zone.  But no more than 1/4 of a meg - there's
+		 * no point in going beyond the size of L2 cache.
+		 *
+		 * OK, so we don't know how big the cache is.  So guess.
+		 */
+		batch = zone->present_pages / 1024;
+		if (batch * PAGE_SIZE > 256 * 1024)
+			batch = (256 * 1024) / PAGE_SIZE;
+		batch /= 4;		/* We effectively *= 4 below */
+		if (batch < 1)
+			batch = 1;
+
+		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+			struct per_cpu_pages *pcp;
+
+			pcp = &zone->pageset[cpu].pcp[0];	/* hot */
+			pcp->count = 0;
+			pcp->low = 2 * batch;
+			pcp->high = 6 * batch;
+			pcp->batch = 1 * batch;
+			INIT_LIST_HEAD(&pcp->list);
+
+			pcp = &zone->pageset[cpu].pcp[1];	/* cold */
+			pcp->count = 0;
+			pcp->low = 0;
+			pcp->high = 2 * batch;
+			pcp->batch = 1 * batch;
+			INIT_LIST_HEAD(&pcp->list);
+		}
+		printk("  %s zone: %lu pages, LIFO batch:%lu\n",
+				zone_names[j], realsize, batch);
+		INIT_LIST_HEAD(&zone->active_list);
+		INIT_LIST_HEAD(&zone->inactive_list);
+		atomic_set(&zone->nr_scan_active, 0);
+		atomic_set(&zone->nr_scan_inactive, 0);
+		zone->nr_active = 0;
+		zone->nr_inactive = 0;
+		if (!size)
+			continue;
+
+		/*
+		 * The per-page waitqueue mechanism uses hashed waitqueues
+		 * per zone.
+		 */
+		zone->wait_table_size = wait_table_size(size);
+		zone->wait_table_bits =
+			wait_table_bits(zone->wait_table_size);
+		zone->wait_table = (wait_queue_head_t *)
+			alloc_bootmem_node(pgdat, zone->wait_table_size
+						* sizeof(wait_queue_head_t));
+
+		for(i = 0; i < zone->wait_table_size; ++i)
+			init_waitqueue_head(zone->wait_table + i);
+
+		pgdat->nr_zones = j+1;
+
+		zone->zone_mem_map = lmem_map;
+		zone->zone_start_pfn = zone_start_pfn;
+
+		if ((zone_start_pfn) & (zone_required_alignment-1))
+			printk("BUG: wrong zone alignment, it will crash\n");
+
+		memmap_init(lmem_map, size, nid, j, zone_start_pfn);
+
+		zone_start_pfn += size;
+		lmem_map += size;
+
+		for (i = 0; ; i++) {
+			unsigned long bitmap_size;
+
+			INIT_LIST_HEAD(&zone->free_area[i].free_list);
+			if (i == MAX_ORDER-1) {
+				zone->free_area[i].map = NULL;
+				break;
+			}
+
+			/*
+			 * Page buddy system uses "index >> (i+1)",
+			 * where "index" is at most "size-1".
+			 *
+			 * The extra "+3" is to round down to byte
+			 * size (8 bits per byte assumption). Thus
+			 * we get "(size-1) >> (i+4)" as the last byte
+			 * we can access.
+			 *
+			 * The "+1" is because we want to round the
+			 * byte allocation up rather than down. So
+			 * we should have had a "+7" before we shifted
+			 * down by three. Also, we have to add one as
+			 * we actually _use_ the last bit (it's [0,n]
+			 * inclusive, not [0,n[).
+			 *
+			 * So we actually had +7+1 before we shift
+			 * down by 3. But (n+8) >> 3 == (n >> 3) + 1
+			 * (modulo overflows, which we do not have).
+			 *
+			 * Finally, we LONG_ALIGN because all bitmap
+			 * operations are on longs.
+			 */
+			bitmap_size = (size-1) >> (i+4);
+			bitmap_size = LONG_ALIGN(bitmap_size+1);
+			zone->free_area[i].map = 
+			  (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
+		}
+	}
+}
+
+void __init free_area_init_node(int nid, struct pglist_data *pgdat,
+		struct page *node_mem_map, unsigned long *zones_size,
+		unsigned long node_start_pfn, unsigned long *zholes_size)
+{
+	unsigned long size;
+
+	pgdat->node_id = nid;
+	pgdat->node_start_pfn = node_start_pfn;
+	calculate_zone_totalpages(pgdat, zones_size, zholes_size);
+	if (!node_mem_map) {
+		size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
+		node_mem_map = alloc_bootmem_node(pgdat, size);
+	}
+	pgdat->node_mem_map = node_mem_map;
+
+	free_area_init_core(pgdat, zones_size, zholes_size);
+}
+
+#ifndef CONFIG_DISCONTIGMEM
+static bootmem_data_t contig_bootmem_data;
+struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
+
+EXPORT_SYMBOL(contig_page_data);
+
+void __init free_area_init(unsigned long *zones_size)
+{
+	free_area_init_node(0, &contig_page_data, NULL, zones_size,
+			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
+	mem_map = contig_page_data.node_mem_map;
+}
+#endif
+
+#ifdef CONFIG_PROC_FS
+
+#include <linux/seq_file.h>
+
+static void *frag_start(struct seq_file *m, loff_t *pos)
+{
+	pg_data_t *pgdat;
+	loff_t node = *pos;
+
+	for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next)
+		--node;
+
+	return pgdat;
+}
+
+static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+	pg_data_t *pgdat = (pg_data_t *)arg;
+
+	(*pos)++;
+	return pgdat->pgdat_next;
+}
+
+static void frag_stop(struct seq_file *m, void *arg)
+{
+}
+
+/* 
+ * This walks the freelist for each zone. Whilst this is slow, I'd rather 
+ * be slow here than slow down the fast path by keeping stats - mjbligh
+ */
+static int frag_show(struct seq_file *m, void *arg)
+{
+	pg_data_t *pgdat = (pg_data_t *)arg;
+	struct zone *zone;
+	struct zone *node_zones = pgdat->node_zones;
+	unsigned long flags;
+	int order;
+
+	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+		if (!zone->present_pages)
+			continue;
+
+		spin_lock_irqsave(&zone->lock, flags);
+		seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
+		for (order = 0; order < MAX_ORDER; ++order) {
+			unsigned long nr_bufs = 0;
+			struct list_head *elem;
+
+			list_for_each(elem, &(zone->free_area[order].free_list))
+				++nr_bufs;
+			seq_printf(m, "%6lu ", nr_bufs);
+		}
+		spin_unlock_irqrestore(&zone->lock, flags);
+		seq_putc(m, '\n');
+	}
+	return 0;
+}
+
+struct seq_operations fragmentation_op = {
+	.start	= frag_start,
+	.next	= frag_next,
+	.stop	= frag_stop,
+	.show	= frag_show,
+};
+
+static char *vmstat_text[] = {
+	"nr_dirty",
+	"nr_writeback",
+	"nr_unstable",
+	"nr_page_table_pages",
+	"nr_mapped",
+	"nr_slab",
+
+	"pgpgin",
+	"pgpgout",
+	"pswpin",
+	"pswpout",
+	"pgalloc_high",
+
+	"pgalloc_normal",
+	"pgalloc_dma",
+	"pgfree",
+	"pgactivate",
+	"pgdeactivate",
+
+	"pgfault",
+	"pgmajfault",
+	"pgrefill_high",
+	"pgrefill_normal",
+	"pgrefill_dma",
+
+	"pgsteal_high",
+	"pgsteal_normal",
+	"pgsteal_dma",
+	"pgscan_kswapd_high",
+	"pgscan_kswapd_normal",
+
+	"pgscan_kswapd_dma",
+	"pgscan_direct_high",
+	"pgscan_direct_normal",
+	"pgscan_direct_dma",
+	"pginodesteal",
+
+	"slabs_scanned",
+	"kswapd_steal",
+	"kswapd_inodesteal",
+	"pageoutrun",
+	"allocstall",
+
+	"pgrotated",
+};
+
+static void *vmstat_start(struct seq_file *m, loff_t *pos)
+{
+	struct page_state *ps;
+
+	if (*pos >= ARRAY_SIZE(vmstat_text))
+		return NULL;
+
+	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
+	m->private = ps;
+	if (!ps)
+		return ERR_PTR(-ENOMEM);
+	get_full_page_state(ps);
+	ps->pgpgin /= 2;		/* sectors -> kbytes */
+	ps->pgpgout /= 2;
+	return (unsigned long *)ps + *pos;
+}
+
+static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+	(*pos)++;
+	if (*pos >= ARRAY_SIZE(vmstat_text))
+		return NULL;
+	return (unsigned long *)m->private + *pos;
+}
+
+static int vmstat_show(struct seq_file *m, void *arg)
+{
+	unsigned long *l = arg;
+	unsigned long off = l - (unsigned long *)m->private;
+
+	seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
+	return 0;
+}
+
+static void vmstat_stop(struct seq_file *m, void *arg)
+{
+	kfree(m->private);
+	m->private = NULL;
+}
+
+struct seq_operations vmstat_op = {
+	.start	= vmstat_start,
+	.next	= vmstat_next,
+	.stop	= vmstat_stop,
+	.show	= vmstat_show,
+};
+
+#endif /* CONFIG_PROC_FS */
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int page_alloc_cpu_notify(struct notifier_block *self,
+				 unsigned long action, void *hcpu)
+{
+	int cpu = (unsigned long)hcpu;
+	long *count;
+
+	if (action == CPU_DEAD) {
+		/* Drain local pagecache count. */
+		count = &per_cpu(nr_pagecache_local, cpu);
+		atomic_add(*count, &nr_pagecache);
+		*count = 0;
+		local_irq_disable();
+		__drain_pages(cpu);
+		local_irq_enable();
+	}
+	return NOTIFY_OK;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+void __init page_alloc_init(void)
+{
+	hotcpu_notifier(page_alloc_cpu_notify, 0);
+}
+
+static unsigned long higherzone_val(struct zone *z, int max_zone,
+					int alloc_type)
+{
+	int z_idx = zone_idx(z);
+	struct zone *higherzone;
+	unsigned long pages;
+
+	/* there is no higher zone to get a contribution from */
+	if (z_idx == MAX_NR_ZONES-1)
+		return 0;
+
+	higherzone = &z->zone_pgdat->node_zones[z_idx+1];
+
+	/* We always start with the higher zone's protection value */
+	pages = higherzone->protection[alloc_type];
+
+	/*
+	 * We get a lower-zone-protection contribution only if there are
+	 * pages in the higher zone and if we're not the highest zone
+	 * in the current zonelist.  e.g., never happens for GFP_DMA. Happens
+	 * only for ZONE_DMA in a GFP_KERNEL allocation and happens for ZONE_DMA
+	 * and ZONE_NORMAL for a GFP_HIGHMEM allocation.
+	 */
+	if (higherzone->present_pages && z_idx < alloc_type)
+		pages += higherzone->pages_low * sysctl_lower_zone_protection;
+
+	return pages;
+}
+
+/*
+ * setup_per_zone_protection - called whenver min_free_kbytes or
+ *	sysctl_lower_zone_protection changes.  Ensures that each zone
+ *	has a correct pages_protected value, so an adequate number of
+ *	pages are left in the zone after a successful __alloc_pages().
+ *
+ *	This algorithm is way confusing.  I tries to keep the same behavior
+ *	as we had with the incremental min iterative algorithm.
+ */
+static void setup_per_zone_protection(void)
+{
+	struct pglist_data *pgdat;
+	struct zone *zones, *zone;
+	int max_zone;
+	int i, j;
+
+	for_each_pgdat(pgdat) {
+		zones = pgdat->node_zones;
+
+		for (i = 0, max_zone = 0; i < MAX_NR_ZONES; i++)
+			if (zones[i].present_pages)
+				max_zone = i;
+
+		/*
+		 * For each of the different allocation types:
+		 * GFP_DMA -> GFP_KERNEL -> GFP_HIGHMEM
+		 */
+		for (i = 0; i < MAX_NR_ZONES; i++) {
+			/*
+			 * For each of the zones:
+			 * ZONE_HIGHMEM -> ZONE_NORMAL -> ZONE_DMA
+			 */
+			for (j = MAX_NR_ZONES-1; j >= 0; j--) {
+				zone = &zones[j];
+
+				/*
+				 * We never protect zones that don't have memory
+				 * in them (j>max_zone) or zones that aren't in
+				 * the zonelists for a certain type of
+				 * allocation (j>i).  We have to assign these to
+				 * zero because the lower zones take
+				 * contributions from the higher zones.
+				 */
+				if (j > max_zone || j > i) {
+					zone->protection[i] = 0;
+					continue;
+				}
+				/*
+				 * The contribution of the next higher zone
+				 */
+				zone->protection[i] = higherzone_val(zone,
+								max_zone, i);
+				zone->protection[i] += zone->pages_low;
+			}
+		}
+	}
+}
+
+/*
+ * setup_per_zone_pages_min - called when min_free_kbytes changes.  Ensures 
+ *	that the pages_{min,low,high} values for each zone are set correctly 
+ *	with respect to min_free_kbytes.
+ */
+static void setup_per_zone_pages_min(void)
+{
+	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
+	unsigned long lowmem_pages = 0;
+	struct zone *zone;
+	unsigned long flags;
+
+	/* Calculate total number of !ZONE_HIGHMEM pages */
+	for_each_zone(zone) {
+		if (!is_highmem(zone))
+			lowmem_pages += zone->present_pages;
+	}
+
+	for_each_zone(zone) {
+		spin_lock_irqsave(&zone->lru_lock, flags);
+		if (is_highmem(zone)) {
+			/*
+			 * Often, highmem doesn't need to reserve any pages.
+			 * But the pages_min/low/high values are also used for
+			 * batching up page reclaim activity so we need a
+			 * decent value here.
+			 */
+			int min_pages;
+
+			min_pages = zone->present_pages / 1024;
+			if (min_pages < SWAP_CLUSTER_MAX)
+				min_pages = SWAP_CLUSTER_MAX;
+			if (min_pages > 128)
+				min_pages = 128;
+			zone->pages_min = min_pages;
+		} else {
+			/* if it's a lowmem zone, reserve a number of pages 
+			 * proportionate to the zone's size.
+			 */
+			zone->pages_min = (pages_min * zone->present_pages) / 
+			                   lowmem_pages;
+		}
+
+		zone->pages_low = zone->pages_min * 2;
+		zone->pages_high = zone->pages_min * 3;
+		spin_unlock_irqrestore(&zone->lru_lock, flags);
+	}
+}
+
+/*
+ * Initialise min_free_kbytes.
+ *
+ * For small machines we want it small (128k min).  For large machines
+ * we want it large (16MB max).  But it is not linear, because network
+ * bandwidth does not increase linearly with machine size.  We use
+ *
+ *	min_free_kbytes = sqrt(lowmem_kbytes)
+ *
+ * which yields
+ *
+ * 16MB:	128k
+ * 32MB:	181k
+ * 64MB:	256k
+ * 128MB:	362k
+ * 256MB:	512k
+ * 512MB:	724k
+ * 1024MB:	1024k
+ * 2048MB:	1448k
+ * 4096MB:	2048k
+ * 8192MB:	2896k
+ * 16384MB:	4096k
+ */
+static int __init init_per_zone_pages_min(void)
+{
+	unsigned long lowmem_kbytes;
+
+	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
+
+	min_free_kbytes = int_sqrt(lowmem_kbytes);
+	if (min_free_kbytes < 128)
+		min_free_kbytes = 128;
+	if (min_free_kbytes > 16384)
+		min_free_kbytes = 16384;
+	setup_per_zone_pages_min();
+	setup_per_zone_protection();
+	return 0;
+}
+module_init(init_per_zone_pages_min)
+
+/*
+ * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 
+ *	that we can call two helper functions whenever min_free_kbytes
+ *	changes.
+ */
+int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
+		struct file *file, void __user *buffer, size_t *length)
+{
+	proc_dointvec(table, write, file, buffer, length);
+	setup_per_zone_pages_min();
+	setup_per_zone_protection();
+	return 0;
+}
+
+/*
+ * lower_zone_protection_sysctl_handler - just a wrapper around
+ *	proc_dointvec() so that we can call setup_per_zone_protection()
+ *	whenever sysctl_lower_zone_protection changes.
+ */
+int lower_zone_protection_sysctl_handler(ctl_table *table, int write,
+		 struct file *file, void __user *buffer, size_t *length)
+{
+	proc_dointvec_minmax(table, write, file, buffer, length);
+	setup_per_zone_protection();
+	return 0;
+}
-- 
2.30.2